--- /dev/null
+
+SHELL=bash
+
+Ns=[1000,10000,25000]
+ms=[50, 100, 250, 500]
+seeds=[$(shell seq -s, 1 250)]
+all: remembr.RDS
+
+srun=srun -A comdata -p compute-bigmem --time=10:00:00 --mem 4G -c 1
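+
+# Workflow (assumed from the targets below): grid_sweep.py expands the argument grid
+# (3 Ns x 4 ms x 250 seeds = 3000 combinations) into a jobs file with one Rscript
+# invocation per line; run_simulation.sbatch then runs the line indexed by
+# $SLURM_ARRAY_TASK_ID, presumably shifted by its first argument (0 here).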
+
+example_1_jobs: example_1.R
+ grid_sweep.py --command "Rscript example_1.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"]}' --outfile example_1_jobs
+
+example_1.feather: example_1_jobs
+ rm -f example_1.feather
+ sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_1_jobs
+# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_1_jobs
+
+example_2_jobs: example_2.R
+ grid_sweep.py --command "Rscript example_2.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"]}' --outfile example_2_jobs
+
+example_2.feather: example_2_jobs
+ rm -f example_2.feather
+ sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_jobs
+# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_jobs
+
+example_2_B_jobs: example_2_B.R
+ grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs
+
+example_2_B.feather: example_2_B_jobs
+ rm -f example_2_B.feather
+ sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs
+
+example_3_jobs: example_3.R
+ grid_sweep.py --command "Rscript example_3.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_3.feather"]}' --outfile example_3_jobs
+
+example_3.feather: example_3_jobs
+ rm -f example_3.feather
+ sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_3_jobs
+
+
+remembr.RDS: example_2_B.feather example_1.feather example_2.feather example_3.feather
+ ${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
+ ${srun} Rscript plot_example.R --infile example_2.feather --name "plot.df.example.2"
+ ${srun} Rscript plot_example.R --infile example_2_B.feather --name "plot.df.example.2B"
+ ${srun} Rscript plot_example.R --infile example_3.feather --name "plot.df.example.3"
+
+
+
+# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs
+
+# example_2_B_mecor_jobs:
+# grid_sweep.py --command "Rscript example_2_B_mecor.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B_mecor.feather"]}' --outfile example_2_B_mecor_jobs
+
+# example_2_B_mecor.feather:example_2_B_mecor.R example_2_B_mecor_jobs
+# rm -f example_2_B_mecor.feather
+# sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
+# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs
+
+
+
--- /dev/null
+### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is MAR.
+### we have x -> k and k -> w; w is used to predict x via the model.
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous predictor.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ set.seed(seed)
+
+ ## the true value of x
+
+ g <- rbinom(N, 1, 0.5)
+ xprime <- Bgx * g + rnorm(N,0,1)
+
+ k <- Bkx*xprime + rnorm(N,0,3)
+
+ x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+ y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
+ df <- data.table(x=x,k=k,y=y,g=g)
+
+ if( m < N){
+ df <- df[sample(nrow(df), m), x.obs := x]
+ } else {
+ df <- df[, x.obs := x]
+ }
+
+ w.model <- glm(x ~ k,df, family=binomial(link='logit'))
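+    ## predict() on a glm defaults to the link scale, so w is a noisy log-odds score
+    ## and w_pred thresholds those log-odds (not probabilities) at 0.5.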
+ w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+ ## y = B0 + B1x + e
+
+ df[,':='(w=w, w_pred = as.integer(w>0.5))]
+ return(df)
+}
+
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=100, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- 0
+Bkx <- 3.2
+Bgx <- 0
+
+df <- simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+
+outline <- run_simulation(df
+                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
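+## many sbatch array tasks append to the same feather file, so serialize the
+## read-modify-write with a file lock.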
+outfile_lock <- lock(paste0(args$outfile, '_lock'))
+if(file.exists(args$outfile)){
+ logdata <- read_feather(args$outfile)
+ logdata <- rbind(logdata,as.data.table(outline))
+} else {
+ logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
+
+## for(N in Ns){
+## print(N)
+## for(m in ms){
+## if(N>m){
+## for(seed in seeds){
+## rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bkx, seed)))
+## }
+## }
+## }
+## }
+
+
+## run_simulation <- function(N, m, B0, Bxy, Bkx, seed){
+## result <- list()
+## df <- simulate_latent_cocause(N, m, B0, Bxy, Bkx, seed)
+
+## result <- append(result, list(N=N,
+## m=m,
+## B0=B0,
+## Bxy=Bxy,
+## Bkx=Bkx,
+## seed=seed))
+
+## (correlation <- cor(df$w,df$x,method='spearman'))
+## result <- append(result, list(correlation=correlation))
+
+## (accuracy <- mean(df$x == df$w_pred))
+
+## result <- append(result, list(accuracy=accuracy))
+
+## (model.true <- lm(y ~ x, data=df))
+
+## (cor(resid(model.true),df$w))
+
+## true.ci.Bxy <- confint(model.true)['x',]
+
+## result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+## Bxy.ci.upper.true = true.ci.Bxy[2],
+## Bxy.ci.lower.true = true.ci.Bxy[1]))
+
+## (model.naive <- lm(y~w, data=df))
+
+## (model.feasible <- lm(y~x.obs,data=df))
+
+## feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+## Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+## Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+
+## naive.ci.Bxy <- confint(model.naive)['w',]
+
+## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+## Bxy.ci.upper.naive = naive.ci.Bxy[2],
+## Bxy.ci.lower.naive = naive.ci.Bxy[1]))
+
+
+## ## multiple imputation when k is observed
+
+## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=c("x.obs"),lgstc=c('w'))
+## mod.amelia.k <- zelig(y~x.obs, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+## est.x.mi <- coefse['x.obs','Estimate']
+## est.x.se <- coefse['x.obs','Std.Error']
+## result <- append(result,
+## list(Bxy.est.amelia.full = est.x.mi,
+## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+## ))
+
+
+## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"),noms=c("x.obs"),lgstc=c('w'))
+## mod.amelia.nok <- zelig(y~x.obs, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+## est.x.mi <- coefse['x.obs','Estimate']
+## est.x.se <- coefse['x.obs','Std.Error']
+## result <- append(result,
+## list(Bxy.est.amelia.nok = est.x.mi,
+## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+## ))
+
+## p <- v <- train <- rep(0,N)
+## M <- m
+## p[(M+1):(N)] <- 1
+## v[1:(M)] <- 1
+## df <- df[order(x.obs)]
+## y <- df[,y]
+## x <- df[,x.obs]
+## w <- df[,w]
+## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
+
+## result <- append(result,
+## list(Bxy.est.gmm = gmm.res$beta[1,1],
+## Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+## Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
+
+## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs), df, B=400, method='efficient')
+
+## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+
+## result <- append(result, list(
+## Bxy.est.mecor = mecor.ci['Estimate'],
+## Bxy.upper.mecor = mecor.ci['UCI'],
+## Bxy.lower.mecor = mecor.ci['LCI'])
+## )
+
+
+## return(result)
+## }
+
--- /dev/null
+### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+ amelia.ncpus=1)
+
+## SETUP:
+### we want to estimate x -> y; x is MAR.
+### we have x -> k and k -> w; w is used to predict x via the model.
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous predictor.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+source("simulation_base.R")
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ set.seed(seed)
+
+ ## the true value of x
+
+ g <- rbinom(N, 1, 0.5)
+ k <- rnorm(N, 0, 1)
+ xprime <- Bkx*k + Bgx * g + rnorm(N,0,1)
+ xvec <- scale(xprime)
+
+ y <- Bxy * xvec + Bgy * g + rnorm(N, 0, 1) + B0
+
+ df <- data.table(x=xvec,k=k,y=y,g=g)
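+    ## scale() returns a 1-column matrix, which mangles the data.table column names; reset them.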
+ names(df) <- c('x','k','y','g')
+ if( m < N){
+ df <- df[sample(nrow(df), m), x.obs := x]
+ } else {
+ df <- df[, x.obs := x]
+ }
+
+ w.model <- lm(x ~ k,df)
+ w <- predict(w.model,data.frame(k=k))
+ w <- logistic(w + rnorm(N,0,sd(w)*0.1))
+ ## y = B0 + B1x + e
+
+ df[,':='(w=w, w_pred = as.integer(w>0.5))]
+ return(df)
+}
+
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
+args <- parse_args(parser)
+
+Ns <- c(1000, 10000, 1e6)
+ms <- c(100, 250, 500, 1000)
+B0 <- 0
+Bxy <- 0.2
+Bgy <- -0.2
+Bkx <- 2
+Bgx <- 3
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+ ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+ logdata <- read_feather(args$outfile)
+ logdata <- rbind(logdata,as.data.table(outline))
+} else {
+ logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
--- /dev/null
+### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### This is the same as example 2, only instead of x -> k we have k -> x.
+### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+ amelia.ncpus=1)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is MAR.
+### we have x -> k and k -> w; w is used to predict x via the model.
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous predictor.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ set.seed(seed)
+
+ ## the true value of x
+
+ g <- rbinom(N, 1, 0.5)
+ xprime <- Bgx * g + rnorm(N,0,1)
+
+ k <- Bkx*xprime + rnorm(N,0,3)
+
+ x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+ y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
+ df <- data.table(x=x,k=k,y=y,g=g)
+
+ if( m < N){
+ df <- df[sample(nrow(df), m), x.obs := x]
+ } else {
+ df <- df[, x.obs := x]
+ }
+
+ w.model <- glm(x ~ k,df, family=binomial(link='logit'))
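+    ## predict() on a glm defaults to the link scale, so w is a noisy log-odds score
+    ## and w_pred thresholds those log-odds (not probabilities) at 0.5.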
+ w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+ ## y = B0 + B1x + e
+
+ df[,':='(w=w, w_pred = as.integer(w>0.5))]
+ return(df)
+}
+
+schennach <- function(df){
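+    ## two-stage sketch (after Schennach): first predict x from the proxy w, then
+    ## regress y on the first-stage prediction plus g.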
+    fwx <- glm(x.obs ~ w, df, family=binomial(link='logit'))
+    df[, xstar_pred := predict(fwx, df)]
+    gxt <- lm(y ~ xstar_pred + g, df)
+    return(gxt)
+}
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+rows <- list()
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- -0.2
+Bkx <- 1
+Bgx <- 3
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+ ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+ logdata <- read_feather(args$outfile)
+ logdata <- rbind(logdata,as.data.table(outline))
+} else {
+ logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
+
+## Ns <- c(1e6, 5e4, 1000)
+## ms <- c(100, 250, 500, 1000)
+## seeds <- 1:500
+
+## rowssets <- list()
+## library(doParallel)
+## options(mc.cores = parallel::detectCores())
+## cl <- makeCluster(20)
+## registerDoParallel(cl)
+
+## ## library(future)
+
+## ## plan(multiprocess,workers=40,gc=TRUE)
+
+## for(N in Ns){
+## print(N)
+## for(m in ms){
+## if(N>m){
+## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
+## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
+
+## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
+## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
+## rowsets <- append(rowssets, list(data.table(new.rows)))
+## }
+
+## }
+## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+## }
+ ## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w},
+ ## packages=c('mecor','Amelia','Zelig','predictionError'),
+ ## seed=TRUE)))
+
+
+## df <- rbindlist(rowsets)
+
+## write_feather(df,"example_2B_simulation.feather")
+
+## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+## result <- list()
+## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+## result <- append(result, list(N=N,
+## m=m,
+## B0=B0,
+## Bxy=Bxy,
+## Bgy=Bgy,
+## Bkx=Bkx,
+## seed=seed))
+
+## (accuracy <- df[,.(mean(w_pred==x))])
+## result <- append(result, list(accuracy=accuracy))
+
+## (model.true <- lm(y ~ x + g, data=df))
+## true.ci.Bxy <- confint(model.true)['x',]
+## true.ci.Bgy <- confint(model.true)['g',]
+
+## result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+## Bgy.est.true=coef(model.true)['g'],
+## Bxy.ci.upper.true = true.ci.Bxy[2],
+## Bxy.ci.lower.true = true.ci.Bxy[1],
+## Bgy.ci.upper.true = true.ci.Bgy[2],
+## Bgy.ci.lower.true = true.ci.Bgy[1]))
+
+## (model.feasible <- lm(y~x.obs+g,data=df))
+
+## feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+## Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+## Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+## feasible.ci.Bgy <- confint(model.feasible)['g',]
+## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+## Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+## Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+## (model.naive <- lm(y~w+g, data=df))
+
+## naive.ci.Bxy <- confint(model.naive)['w',]
+## naive.ci.Bgy <- confint(model.naive)['g',]
+
+## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+## Bgy.est.naive=coef(model.naive)['g'],
+## Bxy.ci.upper.naive = naive.ci.Bxy[2],
+## Bxy.ci.lower.naive = naive.ci.Bxy[1],
+## Bgy.ci.upper.naive = naive.ci.Bgy[2],
+## Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+
+
+## ## multiple imputation when k is observed
+## ## amelia does great at this one.
+## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
+## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+## est.x.mi <- coefse['x.obs','Estimate']
+## est.x.se <- coefse['x.obs','Std.Error']
+## result <- append(result,
+## list(Bxy.est.amelia.full = est.x.mi,
+## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+## ))
+
+## est.g.mi <- coefse['g','Estimate']
+## est.g.se <- coefse['g','Std.Error']
+
+## result <- append(result,
+## list(Bgy.est.amelia.full = est.g.mi,
+## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+## ))
+
+## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
+## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+## est.x.mi <- coefse['x.obs','Estimate']
+## est.x.se <- coefse['x.obs','Std.Error']
+## result <- append(result,
+## list(Bxy.est.amelia.nok = est.x.mi,
+## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+## ))
+
+## est.g.mi <- coefse['g','Estimate']
+## est.g.se <- coefse['g','Std.Error']
+
+## result <- append(result,
+## list(Bgy.est.amelia.nok = est.g.mi,
+## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+## ))
+
+## p <- v <- train <- rep(0,N)
+## M <- m
+## p[(M+1):(N)] <- 1
+## v[1:(M)] <- 1
+## df <- df[order(x.obs)]
+## y <- df[,y]
+## x <- df[,x.obs]
+## g <- df[,g]
+## w <- df[,w]
+## # gmm gets pretty close
+## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
+
+## result <- append(result,
+## list(Bxy.est.gmm = gmm.res$beta[1,1],
+## Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+## Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
+
+## result <- append(result,
+## list(Bgy.est.gmm = gmm.res$beta[2,1],
+## Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+## Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+
+## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+## (mod.calibrated.mle)
+## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+## result <- append(result, list(
+## Bxy.est.mecor = mecor.ci['Estimate'],
+## Bxy.upper.mecor = mecor.ci['UCI'],
+## Bxy.lower.mecor = mecor.ci['LCI'])
+## )
+
+## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+## result <- append(result, list(
+## Bxy.est.mecor = mecor.ci['Estimate'],
+## Bxy.upper.mecor = mecor.ci['UCI'],
+## Bxy.lower.mecor = mecor.ci['LCI'])
+## )
+
+
+## return(result)
+## }
+
--- /dev/null
+### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### This is the same as example 2, only instead of x -> k we have k -> x.
+### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+ amelia.ncpus=1)
+
+source("simulation_base_mecor.R")
+
+## SETUP:
+### we want to estimate x -> y; x is MAR.
+### we have x -> k and k -> w; w is used to predict x via the model.
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous predictor.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ set.seed(seed)
+
+ ## the true value of x
+
+ g <- rbinom(N, 1, 0.5)
+ xprime <- Bgx * g + rnorm(N,0,1)
+
+ k <- Bkx*xprime + rnorm(N,0,3)
+
+ x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+ y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
+ df <- data.table(x=x,k=k,y=y,g=g)
+
+ if( m < N){
+ df <- df[sample(nrow(df), m), x.obs := x]
+ } else {
+ df <- df[, x.obs := x]
+ }
+
+ w.model <- glm(x ~ k,df, family=binomial(link='logit'))
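+    ## predict() on a glm defaults to the link scale, so w is a noisy log-odds score
+    ## and w_pred thresholds those log-odds (not probabilities) at 0.5.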
+ w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+ ## y = B0 + B1x + e
+
+ df[,':='(w=w, w_pred = as.integer(w>0.5))]
+ return(df)
+}
+
+schennach <- function(df){
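+    ## two-stage sketch (after Schennach): first predict x from the proxy w, then
+    ## regress y on the first-stage prediction plus g.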
+    fwx <- glm(x.obs ~ w, df, family=binomial(link='logit'))
+    df[, xstar_pred := predict(fwx, df)]
+    gxt <- lm(y ~ xstar_pred + g, df)
+    return(gxt)
+}
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+B0 <- 0
+Bxy <- 0.4
+Bgy <- 0.25
+Bkx <- 3.2
+Bgx <- -0.5
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+ ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'))
+if(file.exists(args$outfile)){
+ logdata <- read_feather(args$outfile)
+ logdata <- rbind(logdata,as.data.table(outline))
+} else {
+ logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
+
+## Ns <- c(1e6, 5e4, 1000)
+## ms <- c(100, 250, 500, 1000)
+## seeds <- 1:500
+
+## rowssets <- list()
+## library(doParallel)
+## options(mc.cores = parallel::detectCores())
+## cl <- makeCluster(20)
+## registerDoParallel(cl)
+
+## ## library(future)
+
+## ## plan(multiprocess,workers=40,gc=TRUE)
+
+## for(N in Ns){
+## print(N)
+## for(m in ms){
+## if(N>m){
+## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
+## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
+
+## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
+## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
+## rowsets <- append(rowssets, list(data.table(new.rows)))
+## }
+
+## }
+## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+## }
+ ## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w},
+ ## packages=c('mecor','Amelia','Zelig','predictionError'),
+ ## seed=TRUE)))
+
+
+## df <- rbindlist(rowsets)
+
+## write_feather(df,"example_2B_simulation.feather")
+
+## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+## result <- list()
+## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+## result <- append(result, list(N=N,
+## m=m,
+## B0=B0,
+## Bxy=Bxy,
+## Bgy=Bgy,
+## Bkx=Bkx,
+## seed=seed))
+
+## (accuracy <- df[,.(mean(w_pred==x))])
+## result <- append(result, list(accuracy=accuracy))
+
+## (model.true <- lm(y ~ x + g, data=df))
+## true.ci.Bxy <- confint(model.true)['x',]
+## true.ci.Bgy <- confint(model.true)['g',]
+
+## result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+## Bgy.est.true=coef(model.true)['g'],
+## Bxy.ci.upper.true = true.ci.Bxy[2],
+## Bxy.ci.lower.true = true.ci.Bxy[1],
+## Bgy.ci.upper.true = true.ci.Bgy[2],
+## Bgy.ci.lower.true = true.ci.Bgy[1]))
+
+## (model.feasible <- lm(y~x.obs+g,data=df))
+
+## feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+## Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+## Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+## feasible.ci.Bgy <- confint(model.feasible)['g',]
+## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+## Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+## Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+## (model.naive <- lm(y~w+g, data=df))
+
+## naive.ci.Bxy <- confint(model.naive)['w',]
+## naive.ci.Bgy <- confint(model.naive)['g',]
+
+## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+## Bgy.est.naive=coef(model.naive)['g'],
+## Bxy.ci.upper.naive = naive.ci.Bxy[2],
+## Bxy.ci.lower.naive = naive.ci.Bxy[1],
+## Bgy.ci.upper.naive = naive.ci.Bgy[2],
+## Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+
+
+## ## multiple imputation when k is observed
+## ## amelia does great at this one.
+## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
+## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+## est.x.mi <- coefse['x.obs','Estimate']
+## est.x.se <- coefse['x.obs','Std.Error']
+## result <- append(result,
+## list(Bxy.est.amelia.full = est.x.mi,
+## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+## ))
+
+## est.g.mi <- coefse['g','Estimate']
+## est.g.se <- coefse['g','Std.Error']
+
+## result <- append(result,
+## list(Bgy.est.amelia.full = est.g.mi,
+## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+## ))
+
+## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
+## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+## est.x.mi <- coefse['x.obs','Estimate']
+## est.x.se <- coefse['x.obs','Std.Error']
+## result <- append(result,
+## list(Bxy.est.amelia.nok = est.x.mi,
+## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+## ))
+
+## est.g.mi <- coefse['g','Estimate']
+## est.g.se <- coefse['g','Std.Error']
+
+## result <- append(result,
+## list(Bgy.est.amelia.nok = est.g.mi,
+## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+## ))
+
+## p <- v <- train <- rep(0,N)
+## M <- m
+## p[(M+1):(N)] <- 1
+## v[1:(M)] <- 1
+## df <- df[order(x.obs)]
+## y <- df[,y]
+## x <- df[,x.obs]
+## g <- df[,g]
+## w <- df[,w]
+## # gmm gets pretty close
+## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
+
+## result <- append(result,
+## list(Bxy.est.gmm = gmm.res$beta[1,1],
+## Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+## Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
+
+## result <- append(result,
+## list(Bgy.est.gmm = gmm.res$beta[2,1],
+## Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+## Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+
+## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+## (mod.calibrated.mle)
+## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+## result <- append(result, list(
+## Bxy.est.mecor = mecor.ci['Estimate'],
+## Bxy.upper.mecor = mecor.ci['UCI'],
+## Bxy.lower.mecor = mecor.ci['LCI'])
+## )
+
+## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+## result <- append(result, list(
+## Bxy.est.mecor = mecor.ci['Estimate'],
+## Bxy.upper.mecor = mecor.ci['UCI'],
+## Bxy.lower.mecor = mecor.ci['LCI'])
+## )
+
+
+## return(result)
+## }
+
--- /dev/null
+### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="multicore",
+ amelia.ncpus=40)
+
+## SETUP:
+### we want to estimate g -> y and x -> y; g is observed, x is MAR.
+### we have k -> x; g -> x; g -> k; k is used to predict x via the model w.
+### we have k -> w; x -> w; w is observed.
+### for illustration, g is binary (e.g., gender == male).
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### Whether a comment is "racial harassment" depends on context, like the race of the person making the comment:
+### e.g., a Black person saying the n-word is less likely to be racial harassment than a white person saying it.
+### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
+### Our content analysts can see signals of the writer's race (e.g., a profile or avatar), so our "ground truth" takes this into account.
+### Our goal is to predict an outcome (say, that someone gets banned from the platform) as a function of whether they made a racially harassing comment and of their race.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ set.seed(seed)
+
+ ## the true value of x
+
+ g <- rbinom(N, 1, 0.5)
+ k <- rnorm(N, 0, 1)
+ x <- Bkx*k + Bgx * g + rnorm(N,0,1)
+ w.model <- lm(x ~ k)
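+    ## w is the model's fitted value plus independent noise: classical measurement
+    ## error around the proxy for x.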
+ w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+ ## y = B0 + B1x + e
+ y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
+ df <- data.table(x=x,k=k,y=y,w=w,g=g)
+ if( m < N){
+ df <- df[sample(nrow(df), m), x.obs := x]
+ } else {
+ df <- df[, x.obs := x]
+ }
+
+ return(df)
+}
+
+
+run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ result <- list()
+ df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+ result <- append(result, list(N=N,
+ m=m,
+ B0=B0,
+ Bxy=Bxy,
+ Bgy=Bgy,
+ Bkx=Bkx,
+ seed=seed))
+
+ correlation <- cor(df$w,df$x)
+ result <- append(result, list(correlation=correlation))
+
+ model.true <- lm(y ~ x + g, data=df)
+ true.ci.Bxy <- confint(model.true)['x',]
+ true.ci.Bgy <- confint(model.true)['g',]
+
+ result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+ Bgy.est.true=coef(model.true)['g'],
+ Bxy.ci.upper.true = true.ci.Bxy[2],
+ Bxy.ci.lower.true = true.ci.Bxy[1],
+ Bgy.ci.upper.true = true.ci.Bgy[2],
+ Bgy.ci.lower.true = true.ci.Bgy[1]))
+
+
+ model.naive <- lm(y~w+g, data=df)
+
+ naive.ci.Bxy <- confint(model.naive)['w',]
+ naive.ci.Bgy <- confint(model.naive)['g',]
+
+ result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+ Bgy.est.naive=coef(model.naive)['g'],
+ Bxy.ci.upper.naive = naive.ci.Bxy[2],
+ Bxy.ci.lower.naive = naive.ci.Bxy[1],
+ Bgy.ci.upper.naive = naive.ci.Bgy[2],
+ Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+
+
+ ## multiple imputation when k is observed
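+    ## x.obs is NA for unlabeled rows, so amelia imputes it; idvars=c('x') keeps the
+    ## latent true x out of the imputation model while retaining it in the output.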
+ amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
+ mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+ coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
+
+ est.x.mi <- coefse['x.obs','Estimate']
+ est.x.se <- coefse['x.obs','Std.Error']
+ result <- append(result,
+ list(Bxy.est.amelia.full = est.x.mi,
+ Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+ Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+ ))
+
+ est.g.mi <- coefse['g','Estimate']
+ est.g.se <- coefse['g','Std.Error']
+
+ result <- append(result,
+ list(Bgy.est.amelia.full = est.g.mi,
+ Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+ Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+ ))
+
+ ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+ amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
+ mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+ coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
+
+ est.x.mi <- coefse['x.obs','Estimate']
+ est.x.se <- coefse['x.obs','Std.Error']
+ result <- append(result,
+ list(Bxy.est.amelia.nok = est.x.mi,
+ Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+ Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+ ))
+
+ est.g.mi <- coefse['g','Estimate']
+ est.g.se <- coefse['g','Std.Error']
+
+ result <- append(result,
+ list(Bgy.est.amelia.nok = est.g.mi,
+ Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+ Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+ ))
+
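+    ## split indicators for predicted_covariates: df[order(x.obs)] puts the m labeled
+    ## rows (non-NA x.obs) first, so train/v partition the labeled rows and p flags
+    ## the N-m unlabeled "primary" rows.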
+ p <- v <- train <- rep(0,N)
+ M <- m
+ p[(M+1):(N)] <- 1
+    train[1:(M/2)] <- 1
+ v[(M/2+1):(M)] <- 1
+ df <- df[order(x.obs)]
+ y <- df[,y]
+ x <- df[,x.obs]
+ g <- df[,g]
+ w <- df[,w]
+ gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
+ result <- append(result,
+ list(Bxy.est.gmm = gmm.res$beta[1,1],
+ Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+ Bxy.ci.lower.gmm = gmm.res$confint[1,1],
+ Bgy.est.gmm = gmm.res$beta[2,1],
+ Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+ Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+ return(result)
+}
+
+Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
+ms <- c(30, 50, 100, 200, 300, 500)
+B0 <- 0
+Bxy <- 1
+Bgy <- 0.3
+Bkx <- 3
+Bgx <- -4
+seeds <- 1:100
+
+rows <- list()
+
+for(N in Ns){
+ print(N)
+ for(m in ms){
+ if(N>m){
+ for(seed in seeds){
+ rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
+ }
+ }
+ }
+}
+
+result <- rbindlist(rows)
+write_feather(result, "example_2_simulation_continuous.feather")
--- /dev/null
+### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### What kind of data invalidates Fong + Tyler?
+### This is the same as example 2, only instead of x -> k we have k -> x.
+### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+ amelia.ncpus=1)
+setDTthreads(40)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is MAR.
+### we have x -> k and k -> w; w is used to predict x via the model.
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous predictor.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+
+## one way to do it is to add correlation between x.obs and y that isn't captured by w;
+## in other words, the model is missing an important feature of x.obs that's related to y.
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+ set.seed(seed)
+
+ ## the true value of x
+
+ g <- rbinom(N, 1, 0.5)
+
+ # make w and y dependent
+ u <- rnorm(N,0,Bxy)
+
+ xprime <- Bgx * g + rnorm(N,0,1)
+
+ k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u
+
+ x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+ y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
+
+ df <- data.table(x=x,k=k,y=y,g=g)
+
+ w.model <- glm(x ~ k,df, family=binomial(link='logit'))
+
+ if( m < N){
+ df <- df[sample(nrow(df), m), x.obs := x]
+ } else {
+ df <- df[, x.obs := x]
+ }
+
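+    ## predict() on a glm defaults to the link scale, so w is a noisy log-odds score
+    ## and w_pred thresholds those log-odds (not probabilities) at 0.5.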
+ w <- predict(w.model, df) + rnorm(N, 0, 1)
+ ## y = B0 + B1x + e
+
+ df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
+ return(df)
+}
+
+## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+## set.seed(seed)
+
+## ## the true value of x
+
+## g <- rbinom(N, 1, 0.5)
+
+## # make w and y dependent
+## u <- rnorm(N,0,5)
+
+## xprime <- Bgx * g + rnorm(N,0,1)
+
+## k <- Bkx*xprime + rnorm(N,0,3)
+
+## x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
+
+## y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
+
+## df <- data.table(x=x,k=k,y=y,g=g)
+
+## w.model <- glm(x ~ k, df, family=binomial(link='logit'))
+
+## if( m < N){
+## df <- df[sample(nrow(df), m), x.obs := x]
+## } else {
+## df <- df[, x.obs := x]
+## }
+
+## w <- predict(w.model,data.frame(k=k)) + u
+## ## y = B0 + B1x + e
+
+## df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
+## return(df)
+## }
+
+
+schennach <- function(df){
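+    ## two-stage sketch (after Schennach): first predict x from the proxy w, then
+    ## regress y on the first-stage prediction plus g.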
+    fwx <- glm(x.obs ~ w, df, family=binomial(link='logit'))
+    df[, xstar_pred := predict(fwx, df)]
+    gxt <- lm(y ~ xstar_pred + g, df)
+    return(gxt)
+}
+
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
+args <- parse_args(parser)
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- 0
+Bkx <- 2
+Bgx <- 0
+
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+ ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+ logdata <- read_feather(args$outfile)
+ logdata <- rbind(logdata,as.data.table(outline))
+} else {
+ logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
--- /dev/null
+source("RemembR/R/RemembeR.R")
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_1.feather"))
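+## For each estimator, summarize by (N, m): coverage of the truth by the 95% CI
+## (p.true.in.ci), mean bias, and the share of runs whose estimate is significant
+## and correctly signed (p.sign.correct).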
+
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+ bias = Bxy - Bxy.est.naive,
+ sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.estimate=mean(Bxy.est.naive),
+ var.estimate=var(Bxy.est.naive),
+ Bxy=mean(Bxy),
+ variable='x',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+ bias = Bxy.est.true - Bxy.est.amelia.full,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+ bias = Bgy.est.amelia.full - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+ bias = Bxy.est.amelia.nok - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+ x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+ bias = Bgy.est.amelia.nok - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+
+x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
+
+x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
+ zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
+ bias = Bxy.est.mecor - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
+
+x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Regression Calibration'
+ ),
+ by=c('N','m')]
+
+g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
+
+g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
+ zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
+ bias = Bgy.est.mecor - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Regression Calibration'
+ ),
+ by=c('N','m')]
+
+## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
+
+## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
+## zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
+## bias = Bgy.est.mecor - Bgy.est.true,
+## sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+## mean.bias = mean(bias),
+## p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+## variable='g',
+## method='Regression Calibration'
+## ),
+## by=c('N','m')]
+
+
+x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
+x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
+ zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
+ bias = Bxy.est.gmm - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
+
+x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='2SLS+gmm'
+ ),
+ by=c('N','m')]
+
+g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
+g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
+ zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
+ bias = Bgy.est.gmm - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
+
+g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='2SLS+gmm'
+ ),
+ by=c('N','m')]
+
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy - Bgy.est.naive,
+                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+
+plot.df <- rbindlist(list(x.naive.plot, g.naive.plot, x.amelia.full.plot, g.amelia.full.plot, x.amelia.nok.plot, g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot), use.names=TRUE, fill=TRUE)
+
+remember(plot.df,'example.1.plot.df')
+
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ests <- df[,.(Bxy.est.true = mean(Bxy.est.true),
+ Bxy.est.naive = mean(Bxy.est.naive),
+ Bxy.est.feasible = mean(Bxy.est.feasible),
+ Bxy.est.amelia.full = mean(Bxy.est.amelia.full),
+ Bxy.est.amelia.nok = mean(Bxy.est.amelia.nok),
+ Bxy.est.mecor = mean(Bxy.est.mecor),
+ Bxy.est.gmm = mean(Bxy.est.gmm)),
+ by=c("N","m")]
--- /dev/null
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_2_simulation.feather"))
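+## For each estimator, summarize by (N, m): coverage of the truth by the 95% CI
+## (p.true.in.ci), mean bias, and the share of runs whose estimate is significant
+## and correctly signed (p.sign.correct).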
+
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+ bias = Bxy - Bxy.est.naive,
+ sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+ bias = Bgy - Bgy.est.naive,
+ sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+
+
+x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+ bias = Bxy.est.true - Bxy.est.amelia.full,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+ bias = Bgy.est.amelia.full - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+
+
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+ bias = Bxy.est.amelia.nok - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+ x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+ bias = Bgy.est.amelia.nok - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
--- /dev/null
+source("RemembR/R/RemembeR.R")
+library(arrow)
+library(data.table)
+library(ggplot2)
+library(filelock)
+library(argparser)
+
+parser <- arg_parser("Plot results from the corrected-model simulations.")
+parser <- add_argument(parser, "--infile", default="", help="name of the feather file to read.")
+parser <- add_argument(parser, "--name", default="", help="the name to save the data to in the remember file.")
+args <- parse_args(parser)
+
+l <- filelock::lock(paste0(args$infile, "_lock"), exclusive=FALSE)
+df <- data.table(read_feather(args$infile))
+filelock::unlock(l)
+
+build_plot_dataset <- function(df){
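+    ## For each estimator, summarize by (N, m): CI coverage of the truth (p.true.in.ci),
+    ## mean bias, the mean and variance of the estimates across runs, and the share of
+    ## runs whose estimate is significant and correctly signed (p.sign.correct).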
+ x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+ x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+ bias = Bxy - Bxy.est.naive,
+ Bxy.est.naive = Bxy.est.naive,
+ sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+ x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ mean.est = mean(Bxy.est.naive),
+ var.est = var(Bxy.est.naive),
+ N.sims = .N,
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+
+ g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+ g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+ bias = Bgy - Bgy.est.naive,
+ Bgy.est.naive = Bgy.est.naive,
+ sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+ g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ mean.est = mean(Bgy.est.naive),
+ var.est = var(Bgy.est.naive),
+ N.sims = .N,
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+
+ x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
+ x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
+ zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
+ bias = Bxy - Bxy.est.feasible,
+ Bxy.est.feasible = Bxy.est.feasible,
+ sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
+
+ x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ mean.est = mean(Bxy.est.feasible),
+ var.est = var(Bxy.est.feasible),
+ N.sims = .N,
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Feasible'
+ ),
+ by=c('N','m')]
+
+
+ g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
+ g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
+ zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
+ bias = Bgy - Bgy.est.feasible,
+ Bgy.est.feasible = Bgy.est.feasible,
+ sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
+
+ g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ mean.est = mean(Bgy.est.feasible),
+ var.est = var(Bgy.est.feasible),
+ N.sims = .N,
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Feasible'
+ ),
+ by=c('N','m')]
+
+
+
+ x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+ x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+ bias = Bxy.est.true - Bxy.est.amelia.full,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+ x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bxy.est.amelia.full),
+ var.est = var(Bxy.est.amelia.full),
+ N.sims = .N,
+ variable='x',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+
+ g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+ g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+ bias = Bgy.est.amelia.full - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+ g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bgy.est.amelia.full),
+ var.est = var(Bgy.est.amelia.full),
+ N.sims = .N,
+ variable='g',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+ x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+ x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+ bias = Bxy.est.amelia.nok - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+ x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bxy.est.amelia.nok),
+ var.est = var(Bxy.est.amelia.nok),
+ N.sims = .N,
+ variable='x',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+ g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+ g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+ bias = Bgy.est.amelia.nok - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+ g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bgy.est.amelia.nok),
+ var.est = var(Bgy.est.amelia.nok),
+ N.sims = .N,
+ variable='g',
+ method="Multiple imputation (Classifier features unobserved)"
+ ),
+ by=c('N','m')]
+
+
+ x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
+
+ x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
+ zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
+ bias = Bxy.est.mecor - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
+
+ x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bxy.est.mecor),
+ var.est = var(Bxy.est.mecor),
+ N.sims = .N,
+ variable='x',
+ method='Regression Calibration'
+ ),
+ by=c('N','m')]
+
+ g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
+
+ g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
+ zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
+ bias = Bgy.est.mecor - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+ g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bgy.est.mecor),
+ var.est = var(Bgy.est.mecor),
+ N.sims = .N,
+ variable='g',
+ method='Regression Calibration'
+ ),
+ by=c('N','m')]
+
+
+ x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
+ x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
+ zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
+ bias = Bxy.est.gmm - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
+
+ x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bxy.est.gmm),
+ var.est = var(Bxy.est.gmm),
+ N.sims = .N,
+ variable='x',
+ method='2SLS+gmm'
+ ),
+ by=c('N','m')]
+
+ g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
+ g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
+ zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
+ bias = Bgy.est.gmm - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
+
+ g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ mean.est = mean(Bgy.est.gmm),
+ var.est = var(Bgy.est.gmm),
+ N.sims = .N,
+ variable='g',
+ method='2SLS+gmm'
+ ),
+ by=c('N','m')]
+
+ accuracy <- df[,mean(accuracy)]
+
+ plot.df <- rbindlist(list(x.naive.plot, g.naive.plot, x.feasible.plot, g.feasible.plot,
+ x.amelia.full.plot, g.amelia.full.plot, x.amelia.nok.plot, g.amelia.nok.plot,
+ x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot),
+ use.names=TRUE)
+ plot.df[, accuracy := accuracy]
+
+ return(plot.df)
+}
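+
+## The per-method blocks in build_plot_dataset are deliberately explicit, but
+## they all follow the same pattern. A minimal sketch of how they could be
+## factored (summarize_estimator is a hypothetical helper, not used by this
+## script; it assumes the <coef>.est.<suffix> / <coef>.ci.lower.<suffix> /
+## <coef>.ci.upper.<suffix> column-naming convention holds for every method):
+summarize_estimator <- function(df, suffix, coefname, truth, variable, method){
+    part <- data.table(N = df$N, m = df$m,
+                       est = df[[paste0(coefname, '.est.', suffix)]],
+                       ci.lower = df[[paste0(coefname, '.ci.lower.', suffix)]],
+                       ci.upper = df[[paste0(coefname, '.ci.upper.', suffix)]],
+                       truth = truth)
+    part <- part[,':='(true.in.ci = (truth >= ci.lower) & (truth <= ci.upper),
+                       zero.in.ci = (0 >= ci.lower) & (0 <= ci.upper),
+                       bias = est - truth,
+                       sign.correct = sign(truth) == sign(est))]
+    part[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+            mean.bias = mean(bias),
+            mean.est = mean(est),
+            var.est = var(est),
+            N.sims = .N,
+            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+            variable = variable,
+            method = method),
+      by=c('N','m')]
+}
+## e.g. x.naive.plot could then be built as:
+## summarize_estimator(df, 'naive', 'Bxy', df$Bxy, 'x', 'Naive')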
+
+df <- data.table(read_feather(args$infile))
+plot.df <- build_plot_dataset(df)
+remember(plot.df, args$name)
+
+
+## df[gmm.ER_pval<0.05]
+
+
+
+
+## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
+
+## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
--- /dev/null
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_2_simulation.feather"))
+
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+ bias = Bxy - Bxy.est.naive,
+ sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+ zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+ bias = Bgy - Bgy.est.naive,
+ sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Naive'
+ ),
+ by=c('N','m')]
+
+
+
+x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+ bias = Bxy.est.amelia.full - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+ bias = Bgy.est.amelia.full - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Multiple imputation'
+ ),
+ by=c('N','m')]
+
+
+
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+ bias = Bxy.est.amelia.nok - Bxy.est.true,
+ sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='x',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+ zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+ bias = Bgy.est.amelia.nok - Bgy.est.true,
+ sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+ mean.bias = mean(bias),
+ p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+ variable='g',
+ method='Multiple imputation (Classifier features unobserved)'
+ ),
+ by=c('N','m')]
+
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ggplot(plot.df,aes(y=N,x=m,color=mean.bias)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
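+
+## When run with Rscript the plots above go to the default device (Rplots.pdf);
+## ggsave() can write the most recent plot to a chosen file instead
+## (hypothetical filename):
+## ggsave("example_2_sign_correct.png", width=8, height=6)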
--- /dev/null
+library(predictionError)
+library(mecor)
+options(amelia.parallel="no",
+ amelia.ncpus=1)
+library(Amelia)
+library(Zelig)
+
+## inverse-logit link
+logistic <- function(x) {1/(1+exp(-x))}
+
+run_simulation <- function(df, result){
+
+ accuracy <- df[,mean(w_pred==x)]
+ result <- append(result, list(accuracy=accuracy))
+
+ (model.true <- lm(y ~ x + g, data=df))
+ true.ci.Bxy <- confint(model.true)['x',]
+ true.ci.Bgy <- confint(model.true)['g',]
+
+ result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+ Bgy.est.true=coef(model.true)['g'],
+ Bxy.ci.upper.true = true.ci.Bxy[2],
+ Bxy.ci.lower.true = true.ci.Bxy[1],
+ Bgy.ci.upper.true = true.ci.Bgy[2],
+ Bgy.ci.lower.true = true.ci.Bgy[1]))
+
+ (model.feasible <- lm(y~x.obs+g,data=df))
+
+ feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+ result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+ Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+ Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+ feasible.ci.Bgy <- confint(model.feasible)['g',]
+ result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+ Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+ Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+ (model.naive <- lm(y~w+g, data=df))
+
+ naive.ci.Bxy <- confint(model.naive)['w',]
+ naive.ci.Bgy <- confint(model.naive)['g',]
+
+ result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+ Bgy.est.naive=coef(model.naive)['g'],
+ Bxy.ci.upper.naive = naive.ci.Bxy[2],
+ Bxy.ci.lower.naive = naive.ci.Bxy[1],
+ Bgy.ci.upper.naive = naive.ci.Bgy[2],
+ Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+
+
+ ## multiple imputation when k is observed
+ ## amelia does great at this one.
+ noms <- c()
+ if(length(unique(df$x.obs)) <=2){
+ noms <- c(noms, 'x.obs')
+ }
+
+ if(length(unique(df$g)) <=2){
+ noms <- c(noms, 'g')
+ }
+
+
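+ ## `noms` marks nominal variables so amelia() draws their imputations from
+ ## the observed categories; `idvars` are carried through to the imputed data
+ ## sets but excluded from the imputation model.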
+ amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=noms)
+ mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+ (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+ est.x.mi <- coefse['x.obs','Estimate']
+ est.x.se <- coefse['x.obs','Std.Error']
+ result <- append(result,
+ list(Bxy.est.amelia.full = est.x.mi,
+ Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+ Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+ ))
+
+ est.g.mi <- coefse['g','Estimate']
+ est.g.se <- coefse['g','Std.Error']
+
+ result <- append(result,
+ list(Bgy.est.amelia.full = est.g.mi,
+ Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+ Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+ ))
+
+ ## What if we can't observe k? This is the most realistic scenario: we can't include all of the classifier's features in the imputation model.
+ amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"), noms=noms)
+ mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+ (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+ est.x.mi <- coefse['x.obs','Estimate']
+ est.x.se <- coefse['x.obs','Std.Error']
+ result <- append(result,
+ list(Bxy.est.amelia.nok = est.x.mi,
+ Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+ Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+ ))
+
+ est.g.mi <- coefse['g','Estimate']
+ est.g.se <- coefse['g','Std.Error']
+
+ result <- append(result,
+ list(Bgy.est.amelia.nok = est.g.mi,
+ Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+ Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+ ))
+
+ ## set up the sample indicators for the 2SLS+GMM estimator: after sorting,
+ ## rows 1..m hold the validation (gold-standard) sample and rows (m+1)..N
+ ## hold the primary sample, where x is only measured through w.
+ N <- nrow(df)
+ m <- nrow(df[!is.na(x.obs)])
+ p <- v <- train <- rep(0,N)
+ M <- m
+ p[(M+1):(N)] <- 1 # primary-sample indicator
+ v[1:(M)] <- 1 # validation-sample indicator
+ df <- df[order(x.obs)] # rows with observed x.obs first; NAs sort last
+ y <- df[,y]
+ x <- df[,x.obs]
+ g <- df[,g]
+ w <- df[,w]
+ # gmm gets pretty close
+ (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))
+
+ result <- append(result,
+ list(Bxy.est.gmm = gmm.res$beta[1,1],
+ Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+ Bxy.ci.lower.gmm = gmm.res$confint[1,1],
+ gmm.ER_pval = gmm.res$ER_pval
+ ))
+
+ result <- append(result,
+ list(Bgy.est.gmm = gmm.res$beta[2,1],
+ Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+ Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+
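+ ## measurement-error correction via mecor: MeasError(w, reference = x.obs)
+ ## declares w as the error-prone measure of the gold-standard x.obs, and B
+ ## sets the number of bootstrap replicates behind the confidence intervals.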
+ mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+ (mod.calibrated.mle)
+ (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+ result <- append(result, list(
+ Bxy.est.mecor = mecor.ci['Estimate'],
+ Bxy.upper.mecor = mecor.ci['UCI'],
+ Bxy.lower.mecor = mecor.ci['LCI'])
+ )
+
+ (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+ result <- append(result, list(
+ Bgy.est.mecor = mecor.ci['Estimate'],
+ Bgy.upper.mecor = mecor.ci['UCI'],
+ Bgy.lower.mecor = mecor.ci['LCI'])
+ )
+
+ ## clean up memory
+ rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))
+
+ gc()
+ return(result)
+}
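+
+## A minimal sketch of run_simulation's input contract (hypothetical
+## data-generating values; the real simulation code lives in simulation_base.R
+## and the example_* scripts). It assumes data.table is loaded by the caller:
+## N <- 1000; m <- 100
+## x <- rbinom(N, 1, 0.5) # binary ground truth
+## g <- rbinom(N, 1, 0.5) # covariate
+## k <- x + rnorm(N) # latent classifier feature
+## w <- logistic(k + g) # error-prone proxy score for x
+## df <- data.table(x = x, g = g, k = k, w = w,
+##                  w_pred = as.integer(w > 0.5),
+##                  y = x + g + rnorm(N),
+##                  x.obs = ifelse(seq_len(N) <= m, x, NA)) # first m rows labelled, for simplicity
+## result <- run_simulation(df, list(N = N, m = m))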