code.communitydata.science - ml_measurement_error_public.git/commitdiff
add missing simulation code
author Nathan TeBlunthuis <nathante@uw.edu>
Wed, 15 Jun 2022 03:35:50 +0000 (20:35 -0700)
committer Nathan TeBlunthuis <nathante@uw.edu>
Wed, 15 Jun 2022 03:35:50 +0000 (20:35 -0700)
16 files changed:
simulations/Makefile [new file with mode: 0644]
simulations/example_1.R [new file with mode: 0644]
simulations/example_1.feather [new file with mode: 0644]
simulations/example_2.R [new file with mode: 0644]
simulations/example_2_B.R [new file with mode: 0644]
simulations/example_2_B.feather [new file with mode: 0644]
simulations/example_2_B_mecor.R [new file with mode: 0644]
simulations/example_2_binary.R [moved from simulations/mi_simulations/example_2_binary.R with 100% similarity]
simulations/example_2_continuous.R [new file with mode: 0644]
simulations/example_3.R [new file with mode: 0644]
simulations/example_3.feather [new file with mode: 0644]
simulations/plot_example_1.R [new file with mode: 0644]
simulations/plot_example_2.R [new file with mode: 0644]
simulations/plot_example_2_B.R [new file with mode: 0644]
simulations/plot_example_2_continuous.R [new file with mode: 0644]
simulations/simulation_base.R [new file with mode: 0644]

diff --git a/simulations/Makefile b/simulations/Makefile
new file mode 100644 (file)
index 0000000..264b408
--- /dev/null
@@ -0,0 +1,61 @@
+
+SHELL=bash
+
+Ns=[1000,10000,25000]
+ms=[50,100,250,500]
+seeds=[$(shell seq -s, 1 250)]
+all: remembr.RDS
+
+srun=srun -A comdata -p compute-bigmem --time=10:00:00 --mem 4G -c 1
+
+example_1_jobs: example_1.R
+       grid_sweep.py --command "Rscript example_1.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"]}' --outfile example_1_jobs
+
+example_1.feather: example_1_jobs
+       rm -f example_1.feather
+       sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_1_jobs
+#      sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_1_jobs
+
+example_2_jobs: example_2.R
+       grid_sweep.py --command "Rscript example_2.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"]}' --outfile example_2_jobs
+
+example_2.feather: example_2_jobs
+       rm -f example_2.feather
+       sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_jobs
+#      sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_jobs
+
+example_2_B_jobs: example_2_B.R
+       grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs
+
+example_2_B.feather: example_2_B_jobs
+       rm -f example_2_B.feather
+       sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs
+
+example_3_jobs: example_3.R
+       grid_sweep.py --command "Rscript example_3.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_3.feather"]}' --outfile example_3_jobs
+
+example_3.feather: example_3_jobs
+       rm -f example_3.feather
+       sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_3_jobs
+
+
+remembr.RDS: example_1.feather example_2.feather example_2_B.feather example_3.feather
+       ${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
+       ${srun} Rscript plot_example.R --infile example_2.feather --name "plot.df.example.2"
+       ${srun} Rscript plot_example.R --infile example_2_B.feather --name "plot.df.example.2B"
+       ${srun} Rscript plot_example.R --infile example_3.feather --name "plot.df.example.3"
+
+
+
+#      sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs
+
+# example_2_B_mecor_jobs:
+#      grid_sweep.py --command "Rscript example_2_B_mecor.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B_mecor.feather"]}' --outfile example_2_B_mecor_jobs
+
+# example_2_B_mecor.feather:example_2_B_mecor.R example_2_B_mecor_jobs
+#      rm -f example_2_B_mecor.feather
+#      sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
+#      sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs
+
+
+
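The --array=1-3000 ranges above follow from the sweep dimensions: 3 values of N × 4 values of m × 250 seeds = 3000 jobs per example. grid_sweep.py itself is not included in this commit; below is a minimal R sketch of the cross-product job file it presumably writes (the flag names mirror the Makefile; everything else is an assumption):

library(data.table)
## hypothetical stand-in for grid_sweep.py: one command line per parameter combination
grid <- CJ(N=c(1000,10000,25000), m=c(50,100,250,500), seed=1:250)
cmds <- grid[, sprintf("Rscript example_1.R --N %d --m %d --seed %d --outfile example_1.feather",
                       N, m, seed)]
writeLines(cmds, "example_1_jobs")  # one job per line, indexed by the sbatch array
nrow(grid)  # 3000, matching --array=1-3000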
diff --git a/simulations/example_1.R b/simulations/example_1.R
new file mode 100644 (file)
index 0000000..40cdd85
--- /dev/null
@@ -0,0 +1,203 @@
+### EXAMPLE 1: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### Even with a good predictor, bias against a covariate can flip the estimated sign,
+### even when the proxy variable is included in the regression.
+### With some ground truth and multiple imputation, however, the estimate can be corrected.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is missing at random (MAR).
+### we have x -> k and k -> w; w, a model prediction based on k, serves as a proxy for x.
+### A realistic scenario: an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous prediction.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    xprime <- Bgx * g + rnorm(N,0,1)
+
+    k <- Bkx*xprime + rnorm(N,0,3)
+
+    x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 2) + B0
+    df <- data.table(x=x,k=k,y=y,g=g)
+
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    w.model <- glm(x ~ k,df, family=binomial(link='logit'))
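+    ## note: predict() on a binomial glm returns link-scale scores by default,
+    ## so w is a noisy continuous score rather than a probability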
+    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+    ## y = B0 + B1x + e
+
+    df[,':='(w=w, w_pred = as.integer(w>0.5))]
+    return(df)
+}
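+
+## Illustrative usage (a sketch, not run by the pipeline): draw one dataset and
+## compare the naive proxy regression with the feasible ground-truth regression.
+## df.demo <- simulate_latent_cocause(N=1000, m=100, B0=0, Bxy=0.2, Bgy=0,
+##                                    Bkx=3.2, Bgx=0, seed=1)
+## summary(lm(y ~ w, data=df.demo))      # naive: biased by the error in w
+## summary(lm(y ~ x.obs, data=df.demo))  # feasible: uses only the m labeled rows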
+
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=100, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- 0
+Bkx <- 3.2
+Bgx <- 0
+
+df <- simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+
+outline <- run_simulation(df
+                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=0, 'Bkx'=Bkx, 'Bgx'=0, 'seed'=args$seed))
+
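+## serialize appends across concurrent sbatch array tasks: take a file lock,
+## read the accumulated log, append this run's row, and rewrite the feather file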
+outfile_lock <- lock(paste0(args$outfile, '_lock'))
+if(file.exists(args$outfile)){
+    logdata <- read_feather(args$outfile)
+    logdata <- rbind(logdata,as.data.table(outline))
+} else {
+    logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
+
+## for(N in Ns){
+##     print(N)
+##     for(m in ms){
+##         if(N>m){
+##             for(seed in seeds){
+##                 rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bkx,  seed)))
+##             }
+##         }
+##     }
+## }
+
+
+## run_simulation <-  function(N, m, B0, Bxy, Bkx, seed){
+##     result <- list()
+##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bkx, seed)
+
+##     result <- append(result, list(N=N,
+##                                   m=m,
+##                                   B0=B0,
+##                                   Bxy=Bxy,
+##                                   Bkx=Bkx,
+##                                   seed=seed))
+
+##     (correlation <- cor(df$w,df$x,method='spearman'))
+##     result <- append(result, list(correlation=correlation))
+
+##     (accuracy <- mean(df$x == df$w_pred))
+
+##     result <- append(result, list(accuracy=accuracy))
+    
+##     (model.true <- lm(y ~ x, data=df))
+
+##     (cor(resid(model.true),df$w))
+
+##     true.ci.Bxy <- confint(model.true)['x',]
+
+##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
+##                                   Bxy.ci.lower.true = true.ci.Bxy[1]))
+
+##     (model.naive <- lm(y~w, data=df))
+
+##     (model.feasible <- lm(y~x.obs,data=df))
+
+##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+
+##     naive.ci.Bxy <- confint(model.naive)['w',]
+
+##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
+##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1]))
+
+
+##     ## multiple imputation when k is observed
+    
+##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=c("x.obs"),lgstc=c('w'))
+##     mod.amelia.k <- zelig(y~x.obs, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+##     est.x.mi <- coefse['x.obs','Estimate']
+##     est.x.se <- coefse['x.obs','Std.Error']
+##     result <- append(result,
+##                      list(Bxy.est.amelia.full = est.x.mi,
+##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+##                           ))
+
+
+##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"),noms=c("x.obs"),lgstc=c('w'))
+##     mod.amelia.nok <- zelig(y~x.obs, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+##     est.x.mi <- coefse['x.obs','Estimate']
+##     est.x.se <- coefse['x.obs','Std.Error']
+##     result <- append(result,
+##                      list(Bxy.est.amelia.nok = est.x.mi,
+##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+##                           ))
+
+##     p <- v <- train <- rep(0,N)
+##     M <- m
+##     p[(M+1):(N)] <- 1
+##     v[1:(M)] <- 1
+##     df <- df[order(x.obs)]
+##     y <- df[,y]
+##     x <- df[,x.obs]
+##     w <- df[,w]
+##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
+
+##     result <- append(result,
+##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
+##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
+
+##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs), df, B=400, method='efficient')
+
+##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+
+##     result <- append(result, list(
+##                                  Bxy.est.mecor = mecor.ci['Estimate'],
+##                                  Bxy.upper.mecor = mecor.ci['UCI'],
+##                                  Bxy.lower.mecor = mecor.ci['LCI'])
+##                      )
+    
+
+##     return(result)
+## }
+
diff --git a/simulations/example_1.feather b/simulations/example_1.feather
new file mode 100644 (file)
index 0000000..b8efa3f
Binary files /dev/null and b/simulations/example_1.feather differ
diff --git a/simulations/example_2.R b/simulations/example_2.R
new file mode 100644 (file)
index 0000000..2ec23f7
--- /dev/null
@@ -0,0 +1,87 @@
+### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### Even with a good predictor, bias against a covariate can flip the estimated sign,
+### even when the proxy variable is included in the regression.
+### With some ground truth and multiple imputation, however, the estimate can be corrected.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+
+## SETUP:
+### we want to estimate x -> y; x is missing at random (MAR).
+### we have k -> x and k -> w; w, a model prediction based on k, serves as a proxy for x.
+### A realistic scenario: an NLP model predicting something like "racial harassment" in social media comments.
+### In this version x is a continuous (standardized) score and w is a continuous prediction.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+source("simulation_base.R")
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    k <- rnorm(N, 0, 1)
+    xprime <- Bkx*k + Bgx * g + rnorm(N,0,1)
+    xvec <- scale(xprime)
+
+    y <-  Bxy * xvec  + Bgy * g + rnorm(N, 0, 1) + B0
+
+    df <- data.table(x=xvec,k=k,y=y,g=g)
+    names(df) <- c('x','k','y','g') # scale() returns a matrix; reset to plain column names
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    w.model <- lm(x ~ k,df)
+    w <- predict(w.model,data.frame(k=k)) 
+    w <- logistic(w + rnorm(N,0,sd(w)*0.1))
+    ## y = B0 + B1x + e
+
+    df[,':='(w=w, w_pred = as.integer(w>0.5))]
+    return(df)
+}
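+
+## Illustrative check (a sketch, not run by the pipeline): proxy quality for one draw.
+## df.demo <- simulate_latent_cocause(1000, 200, 0, 0.2, -0.2, 2, 3, seed=1)
+## cor(df.demo$w, df.demo$x)  # correlation between the continuous proxy and the latent x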
+
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
+args <- parse_args(parser)
+
+Ns <- c(1000, 10000, 1e6)
+ms <- c(100, 250, 500, 1000)
+B0 <- 0
+Bxy <- 0.2
+Bgy <- -0.2
+Bkx <- 2
+Bgx <- 3
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+    logdata <- read_feather(args$outfile)
+    logdata <- rbind(logdata,as.data.table(outline))
+} else {
+    logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
diff --git a/simulations/example_2_B.R b/simulations/example_2_B.R
new file mode 100644 (file)
index 0000000..91623a1
--- /dev/null
@@ -0,0 +1,276 @@
+### EXAMPLE 2_B: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### This is the same as example 2, except that instead of k -> x we have x -> k.
+### Even with a good predictor, bias against a covariate can flip the estimated sign,
+### even when the proxy variable is included in the regression.
+### With some ground truth and multiple imputation, however, the estimate can be corrected.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is missing at random (MAR).
+### we have x -> k and k -> w; w, a model prediction based on k, serves as a proxy for x.
+### A realistic scenario: an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous prediction.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    xprime <- Bgx * g + rnorm(N,0,1)
+
+    k <- Bkx*xprime + rnorm(N,0,3)
+
+    x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 2) + B0
+    df <- data.table(x=x,k=k,y=y,g=g)
+
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    w.model <- glm(x ~ k,df, family=binomial(link='logit'))
+    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+    ## y = B0 + B1x + e
+
+    df[,':='(w=w, w_pred = as.integer(w>0.5))]
+    return(df)
+}
+
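+## two-stage estimator in the spirit of Schennach-style corrections: regress x.obs
+## on the proxy w, then use the fitted values in the outcome model.
+## Defined for reference; not called below.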
+schennach <- function(df){
+
+    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
+    df[,xstar_pred := predict(fwx, df)]
+    gxt <- lm(y ~ xstar_pred+g, df)
+
+}
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+rows <- list()
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- -0.2
+Bkx <- 1
+Bgx <- 3
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+    logdata <- read_feather(args$outfile)
+    logdata <- rbind(logdata,as.data.table(outline))
+} else {
+    logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
+
+## Ns <- c(1e6, 5e4, 1000)
+## ms <- c(100, 250, 500, 1000)
+## seeds <- 1:500
+
+## rowssets <- list()
+## library(doParallel)
+## options(mc.cores = parallel::detectCores())
+## cl <- makeCluster(20)
+## registerDoParallel(cl)
+
+## ## library(future)
+
+## ## plan(multiprocess,workers=40,gc=TRUE)
+
+## for(N in Ns){
+##     print(N)
+##     for(m in ms){
+##         if(N>m){
+##             new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
+##                                 .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
+                
+##                 {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
+##                                ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
+##             rowsets <- append(rowssets, list(data.table(new.rows)))
+##        }
+
+##     }
+##                 ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+## }
+                ##                                                 ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w},
+                ##                                  packages=c('mecor','Amelia','Zelig','predictionError'),
+                ##                                  seed=TRUE)))
+
+
+## df <- rbindlist(rowsets)
+
+## write_feather(df,"example_2B_simulation.feather")
+
+## run_simulation <-  function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+##     result <- list()
+##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+##     result <- append(result, list(N=N,
+##                                   m=m,
+##                                   B0=B0,
+##                                   Bxy=Bxy,
+##                                   Bgy=Bgy,
+##                                   Bkx=Bkx,
+##                                   seed=seed))
+
+##     (accuracy <- df[,.(mean(w_pred==x))])
+##     result <- append(result, list(accuracy=accuracy))
+
+##     (model.true <- lm(y ~ x + g, data=df))
+##     true.ci.Bxy <- confint(model.true)['x',]
+##     true.ci.Bgy <- confint(model.true)['g',]
+
+##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+##                                   Bgy.est.true=coef(model.true)['g'],
+##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
+##                                   Bxy.ci.lower.true = true.ci.Bxy[1],
+##                                   Bgy.ci.upper.true = true.ci.Bgy[2],
+##                                   Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  
+##     (model.feasible <- lm(y~x.obs+g,data=df))
+
+##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+##     feasible.ci.Bgy <- confint(model.feasible)['g',]
+##     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+##                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+##                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+##     (model.naive <- lm(y~w+g, data=df))
+    
+##     naive.ci.Bxy <- confint(model.naive)['w',]
+##     naive.ci.Bgy <- confint(model.naive)['g',]
+
+##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+##                                   Bgy.est.naive=coef(model.naive)['g'],
+##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
+##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
+##                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
+##                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  
+
+##     ## multiple imputation when k is observed
+##     ## amelia does great at this one.
+##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
+##     mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+##     est.x.mi <- coefse['x.obs','Estimate']
+##     est.x.se <- coefse['x.obs','Std.Error']
+##     result <- append(result,
+##                      list(Bxy.est.amelia.full = est.x.mi,
+##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+##                           ))
+
+##     est.g.mi <- coefse['g','Estimate']
+##     est.g.se <- coefse['g','Std.Error']
+
+##     result <- append(result,
+##                      list(Bgy.est.amelia.full = est.g.mi,
+##                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+##                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+##                           ))
+
+##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
+##     mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+##     est.x.mi <- coefse['x.obs','Estimate']
+##     est.x.se <- coefse['x.obs','Std.Error']
+##     result <- append(result,
+##                      list(Bxy.est.amelia.nok = est.x.mi,
+##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+##                           ))
+
+##     est.g.mi <- coefse['g','Estimate']
+##     est.g.se <- coefse['g','Std.Error']
+
+##     result <- append(result,
+##                      list(Bgy.est.amelia.nok = est.g.mi,
+##                           Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+##                           Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+##                           ))
+
+##     p <- v <- train <- rep(0,N)
+##     M <- m
+##     p[(M+1):(N)] <- 1
+##     v[1:(M)] <- 1
+##     df <- df[order(x.obs)]
+##     y <- df[,y]
+##     x <- df[,x.obs]
+##     g <- df[,g]
+##     w <- df[,w]
+##     # gmm gets pretty close
+##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
+
+##     result <- append(result,
+##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
+##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
+
+##     result <- append(result,
+##                      list(Bgy.est.gmm = gmm.res$beta[2,1],
+##                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+##                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+
+##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+##     (mod.calibrated.mle)
+##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+##     result <- append(result, list(
+##                                  Bxy.est.mecor = mecor.ci['Estimate'],
+##                                  Bxy.upper.mecor = mecor.ci['UCI'],
+##                                  Bxy.lower.mecor = mecor.ci['LCI'])
+##                      )
+
+##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+##     result <- append(result, list(
+##                                  Bxy.est.mecor = mecor.ci['Estimate'],
+##                                  Bxy.upper.mecor = mecor.ci['UCI'],
+##                                  Bxy.lower.mecor = mecor.ci['LCI'])
+##                      )
+
+
+##     return(result)
+## }
+
diff --git a/simulations/example_2_B.feather b/simulations/example_2_B.feather
new file mode 100644 (file)
index 0000000..8d0ecbd
Binary files /dev/null and b/simulations/example_2_B.feather differ
diff --git a/simulations/example_2_B_mecor.R b/simulations/example_2_B_mecor.R
new file mode 100644 (file)
index 0000000..74ad695
--- /dev/null
@@ -0,0 +1,281 @@
+### EXAMPLE 2_B (mecor): demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### This is the same as example 2, except that instead of k -> x we have x -> k.
+### Even with a good predictor, bias against a covariate can flip the estimated sign,
+### even when the proxy variable is included in the regression.
+### With some ground truth and multiple imputation, however, the estimate can be corrected.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+
+source("simulation_base_mecor.R")
+
+## SETUP:
+### we want to estimate x -> y; x is missing at random (MAR).
+### we have x -> k and k -> w; w, a model prediction based on k, serves as a proxy for x.
+### A realistic scenario: an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous prediction.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    xprime <- Bgx * g + rnorm(N,0,1)
+
+    k <- Bkx*xprime + rnorm(N,0,3)
+
+    x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0
+    df <- data.table(x=x,k=k,y=y,g=g)
+
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    w.model <- glm(x ~ k,df, family=binomial(link='logit'))
+    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+    ## y = B0 + B1x + e
+
+    df[,':='(w=w, w_pred = as.integer(w>0.5))]
+    return(df)
+}
+
+schennach <- function(df){
+
+    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
+    df[,xstar_pred := predict(fwx, df)]
+    gxt <- lm(y ~ xstar_pred+g, df)
+
+}
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+rows <- list()
+
+B0 <- 0
+Bxy <- 0.4
+Bgy <- 0.25
+Bkx <- 3.2
+Bgx <- -0.5
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'))
+if(file.exists(args$outfile)){
+    logdata <- read_feather(args$outfile)
+    logdata <- rbind(logdata,as.data.table(outline))
+} else {
+    logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
+
+## Ns <- c(1e6, 5e4, 1000)
+## ms <- c(100, 250, 500, 1000)
+## seeds <- 1:500
+
+## rowssets <- list()
+## library(doParallel)
+## options(mc.cores = parallel::detectCores())
+## cl <- makeCluster(20)
+## registerDoParallel(cl)
+
+## ## library(future)
+
+## ## plan(multiprocess,workers=40,gc=TRUE)
+
+## for(N in Ns){
+##     print(N)
+##     for(m in ms){
+##         if(N>m){
+##             new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
+##                                 .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
+                
+##                 {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
+##                                ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
+##             rowsets <- append(rowssets, list(data.table(new.rows)))
+##        }
+
+##     }
+##                 ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+## }
+                ##                                                 ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w},
+                ##                                  packages=c('mecor','Amelia','Zelig','predictionError'),
+                ##                                  seed=TRUE)))
+
+
+## df <- rbindlist(rowsets)
+
+## write_feather(df,"example_2B_simulation.feather")
+
+## run_simulation <-  function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+##     result <- list()
+##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+##     result <- append(result, list(N=N,
+##                                   m=m,
+##                                   B0=B0,
+##                                   Bxy=Bxy,
+##                                   Bgy=Bgy,
+##                                   Bkx=Bkx,
+##                                   seed=seed))
+
+##     (accuracy <- df[,.(mean(w_pred==x))])
+##     result <- append(result, list(accuracy=accuracy))
+
+##     (model.true <- lm(y ~ x + g, data=df))
+##     true.ci.Bxy <- confint(model.true)['x',]
+##     true.ci.Bgy <- confint(model.true)['g',]
+
+##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+##                                   Bgy.est.true=coef(model.true)['g'],
+##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
+##                                   Bxy.ci.lower.true = true.ci.Bxy[1],
+##                                   Bgy.ci.upper.true = true.ci.Bgy[2],
+##                                   Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  
+##     (model.feasible <- lm(y~x.obs+g,data=df))
+
+##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+##     feasible.ci.Bgy <- confint(model.feasible)['g',]
+##     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+##                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+##                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+##     (model.naive <- lm(y~w+g, data=df))
+    
+##     naive.ci.Bxy <- confint(model.naive)['w',]
+##     naive.ci.Bgy <- confint(model.naive)['g',]
+
+##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+##                                   Bgy.est.naive=coef(model.naive)['g'],
+##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
+##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
+##                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
+##                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  
+
+##     ## multiple imputation when k is observed
+##     ## amelia does great at this one.
+##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
+##     mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+##     est.x.mi <- coefse['x.obs','Estimate']
+##     est.x.se <- coefse['x.obs','Std.Error']
+##     result <- append(result,
+##                      list(Bxy.est.amelia.full = est.x.mi,
+##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+##                           ))
+
+##     est.g.mi <- coefse['g','Estimate']
+##     est.g.se <- coefse['g','Std.Error']
+
+##     result <- append(result,
+##                      list(Bgy.est.amelia.full = est.g.mi,
+##                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+##                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+##                           ))
+
+##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
+##     mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+##     est.x.mi <- coefse['x.obs','Estimate']
+##     est.x.se <- coefse['x.obs','Std.Error']
+##     result <- append(result,
+##                      list(Bxy.est.amelia.nok = est.x.mi,
+##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+##                           ))
+
+##     est.g.mi <- coefse['g','Estimate']
+##     est.g.se <- coefse['g','Std.Error']
+
+##     result <- append(result,
+##                      list(Bgy.est.amelia.nok = est.g.mi,
+##                           Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+##                           Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+##                           ))
+
+##     p <- v <- train <- rep(0,N)
+##     M <- m
+##     p[(M+1):(N)] <- 1
+##     v[1:(M)] <- 1
+##     df <- df[order(x.obs)]
+##     y <- df[,y]
+##     x <- df[,x.obs]
+##     g <- df[,g]
+##     w <- df[,w]
+##     # gmm gets pretty close
+##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
+
+##     result <- append(result,
+##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
+##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
+
+##     result <- append(result,
+##                      list(Bgy.est.gmm = gmm.res$beta[2,1],
+##                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+##                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+
+##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+##     (mod.calibrated.mle)
+##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+##     result <- append(result, list(
+##                                  Bxy.est.mecor = mecor.ci['Estimate'],
+##                                  Bxy.upper.mecor = mecor.ci['UCI'],
+##                                  Bxy.lower.mecor = mecor.ci['LCI'])
+##                      )
+
+##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+##     result <- append(result, list(
+##                                  Bxy.est.mecor = mecor.ci['Estimate'],
+##                                  Bxy.upper.mecor = mecor.ci['UCI'],
+##                                  Bxy.lower.mecor = mecor.ci['LCI'])
+##                      )
+
+
+##     return(result)
+## }
+
diff --git a/simulations/example_2_continuous.R b/simulations/example_2_continuous.R
new file mode 100644 (file)
index 0000000..3ab63b5
--- /dev/null
@@ -0,0 +1,187 @@
+### EXAMPLE 2 (continuous): demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### Even with a good predictor, bias against a covariate can flip the estimated sign,
+### even when the proxy variable is included in the regression.
+### With some ground truth and multiple imputation, however, the estimate can be corrected.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="multicore",
+        amelia.ncpus=40)
+
+## SETUP:
+### we want to estimate g -> y and x -> y; g is observed, x is MAR.
+### we have k -> x and g -> x; w, a model prediction based on k, is the observed proxy for x.
+### for illustration, g is binary (e.g., gender == male).
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
+### Whether a comment is "racial harassment" depends on context, such as the race of the person making the comment:
+### e.g., a Black person using the n-word is less likely to be racial harassment than a white person doing so.
+### Say we have a language model that predicts "racial harassment" but doesn't know the race of the writer,
+### while our content analysts can see signals of the writer's race (e.g., a profile or avatar), so our "ground truth" takes this into account.
+### Our goal is to predict an outcome (say, that someone gets banned from the platform) as a function of whether they made a racially harassing comment and of their race.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    k <- rnorm(N, 0, 1)
+    x <- Bkx*k + Bgx * g + rnorm(N,0,1)
+    w.model <- lm(x ~ k)
+    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+    ## y = B0 + B1x + e
+    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0
+    df <- data.table(x=x,k=k,y=y,w=w,g=g)
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    return(df)
+}
+
+
+run_simulation <-  function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    result <- list()
+    df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+    result <- append(result, list(N=N,
+                                  m=m,
+                                  B0=B0,
+                                  Bxy=Bxy,
+                                  Bgy=Bgy,
+                                  Bkx=Bkx,
+                                  seed=seed))
+
+    correlation <- cor(df$w,df$x)
+    result <- append(result, list(correlation=correlation))
+
+    model.true <- lm(y ~ x + g, data=df)
+    true.ci.Bxy <- confint(model.true)['x',]
+    true.ci.Bgy <- confint(model.true)['g',]
+
+    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bxy.ci.upper.true = true.ci.Bxy[2],
+                                  Bxy.ci.lower.true = true.ci.Bxy[1],
+                                  Bgy.ci.upper.true = true.ci.Bgy[2],
+                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  
+
+    model.naive <- lm(y~w+g, data=df)
+    
+    naive.ci.Bxy <- confint(model.naive)['w',]
+    naive.ci.Bgy <- confint(model.naive)['g',]
+
+    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+                                  Bgy.est.naive=coef(model.naive)['g'],
+                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
+                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
+                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
+                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  
+
+    ## multiple imputation when k is observed
+    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
+    mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+    coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.full = est.x.mi,
+                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.full = est.g.mi,
+                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
+    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+    coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.nok = est.x.mi,
+                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.nok = est.g.mi,
+                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+                          ))
+
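+    ## indicator vectors for predictionError::predicted_covariates (assumed semantics,
+    ## following the commented-out drafts elsewhere in this commit): p marks the N-M
+    ## rows that only have the prediction w; the M labeled rows are split between
+    ## a training half (train) and a validation half (v)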
+    p <- v <- train <- rep(0,N)
+    M <- m
+    p[(M+1):(N)] <- 1
+    train[1:(M/2)] <- 1
+    v[(M/2+1):(M)] <- 1
+    df <- df[order(x.obs)]
+    y <- df[,y]
+    x <- df[,x.obs]
+    g <- df[,g]
+    w <- df[,w]
+    gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
+    result <- append(result,
+                     list(Bxy.est.gmm = gmm.res$beta[1,1],
+                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
+                          Bgy.est.gmm = gmm.res$beta[2,1],
+                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+    return(result)
+}
+
+Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
+ms <- c(30, 50, 100, 200, 300, 500)
+B0 <- 0
+Bxy <- 1
+Bgy <- 0.3
+Bkx <- 3
+Bgx <- -4
+seeds <- 1:100
+
+rows <- list()
+
+for(N in Ns){
+    print(N)
+    for(m in ms){
+        if(N>m){
+            for(seed in seeds){
+                rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
+            }
+        }
+    }
+}
+
+result <- rbindlist(rows)
+write_feather(result, "example_2_simulation_continuous.feather")
diff --git a/simulations/example_3.R b/simulations/example_3.R
new file mode 100644 (file)
index 0000000..389dba1
--- /dev/null
@@ -0,0 +1,144 @@
+### EXAMPLE 3: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
+### What kind of data invalidates Fong & Tyler's approach?
+### Like example 2_B, instead of k -> x we have x -> k.
+### Even with a good predictor, bias against a covariate can flip the estimated sign,
+### even when the proxy variable is included in the regression.
+### With some ground truth and multiple imputation, however, the estimate can be corrected.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+setDTthreads(40)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is missing at random (MAR).
+### we have x -> k and k -> w; w, a model prediction based on k, serves as a proxy for x.
+### A realistic scenario: an NLP model predicting something like "racial harassment" in social media comments.
+### The labels x are binary, but the model provides a continuous prediction.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+
+## one way to do it is to add correlation between w and y that does not run through x:
+## in other words, the model behind w misses a feature (u, below) that also affects y.
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+
+    # make w and y dependent
+    u <- rnorm(N,0,Bxy)
+
+    xprime <- Bgx * g + rnorm(N,0,1) 
+
+    k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u
+
+    x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
+
+    df <- data.table(x=x,k=k,y=y,g=g)
+
+    w.model <- glm(x ~ k,df, family=binomial(link='logit')) 
+
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+
+    w <- predict(w.model, df) + rnorm(N, 0, 1)
+    ## y = B0 + B1x + e
+
+    df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
+    return(df)
+}
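+
+## Illustrative check (a sketch, not run by the pipeline): u enters both k (and hence
+## w) and y, so the error in the proxy is differential; the residual of the true
+## outcome model should correlate with w.
+## df.demo <- simulate_latent_cocause(5000, 200, 0, 0.2, 0, 2, 0, seed=1)
+## cor(resid(lm(y ~ x + g, df.demo)), df.demo$w)  # should be clearly positive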
+
+## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+##     set.seed(seed)
+
+##     ## the true value of x
+
+##     g <- rbinom(N, 1, 0.5)
+
+##     # make w and y dependent
+##     u <- rnorm(N,0,5)
+
+##     xprime <- Bgx * g + rnorm(N,0,1)
+
+##     k <- Bkx*xprime + rnorm(N,0,3)
+
+##     x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
+
+##     y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
+
+##     df <- data.table(x=x,k=k,y=y,g=g)
+
+##     w.model <- glm(x ~ k, df, family=binomial(link='logit')) 
+
+##     if( m < N){
+##         df <- df[sample(nrow(df), m), x.obs := x]
+##     } else {
+##         df <- df[, x.obs := x]
+##     }
+
+##     w <- predict(w.model,data.frame(k=k)) + u 
+##     ## y = B0 + B1x + e
+
+##     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
+##     return(df)
+## }
+
+
+schennach <- function(df){
+
+    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
+    df[,xstar_pred := predict(fwx, df)]
+    gxt <- lm(y ~ xstar_pred+g, df)
+
+}
+
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
+args <- parse_args(parser)
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- 0
+Bkx <- 2
+Bgx <- 0
+
+
+outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+    logdata <- read_feather(args$outfile)
+    logdata <- rbind(logdata,as.data.table(outline))
+} else {
+    logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
diff --git a/simulations/example_3.feather b/simulations/example_3.feather
new file mode 100644 (file)
index 0000000..f130915
Binary files /dev/null and b/simulations/example_3.feather differ
diff --git a/simulations/plot_example_1.R b/simulations/plot_example_1.R
new file mode 100644 (file)
index 0000000..d883d24
--- /dev/null
@@ -0,0 +1,175 @@
+source("RemembR/R/RemembeR.R")
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_1.feather"))
+
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                         bias = Bxy - Bxy.est.naive,
+                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           mean.estimate=mean(Bxy.est.naive),
+                           var.estimate=var(Bxy.est.naive),
+                           Bxy=mean(Bxy),
+                           variable='x',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
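+
+## naive estimates for g, mirroring the x block above (an assumed block: it relies on
+## simulation_base.R logging Bgy.est.naive and its CI, as the commented-out
+## run_simulation drafts in this commit do); plot.df below expects g.naive.plot.
+g.naive <- df[,.(N, m, Bgy.est.true, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.naive) & (Bgy.est.true <= Bgy.ci.upper.naive),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy.est.naive - Bgy.est.true,
+                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.naive))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]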
+
+x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                     bias = Bxy.est.true - Bxy.est.amelia.full,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='x',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+                                     bias =  Bgy.est.amelia.full - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='g',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+                                     bias =  Bxy.est.amelia.nok - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                    by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+                                     bias =  Bgy.est.amelia.nok - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                    by=c('N','m')]
+
+
+x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
+
+x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
+                                     zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
+                                     bias =  Bxy.est.mecor - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
+
+x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='Regression Calibration'
+                                     ),
+                                    by=c('N','m')]
+
+g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
+
+g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
+                                     zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
+                                     bias =  Bgy.est.mecor - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Regression Calibration'
+                                     ),
+                                    by=c('N','m')]
+
+## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
+
+## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
+##                                      zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
+##                                      bias =  Bgy.est.mecor - Bgy.est.true,
+##                                      sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+##                                        mean.bias = mean(bias),
+##                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+##                                      variable='g',
+##                                      method='Regression Calibration'
+##                                      ),
+##                                     by=c('N','m')]
+
+
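+## 2SLS+GMM estimates (from predicted_covariates() in simulation_base.R)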
+x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
+x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
+                                     bias =  Bxy.est.gmm - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
+
+x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='2SLS+gmm'
+                                     ),
+                                    by=c('N','m')]
+
+g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
+g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
+                                     bias =  Bgy.est.gmm - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
+
+g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='2SLS+gmm'
+                                     ),
+                                    by=c('N','m')]
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot))
+
+remember(plot.df,'example.1.plot.df')
+
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
+
+ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
+
+ests <- df[,.(Bxy.est.true = mean(Bxy.est.true),
+              Bxy.est.naive = mean(Bxy.est.naive),
+              Bxy.est.feasible = mean(Bxy.est.feasible),
+              Bxy.est.amelia.full = mean(Bxy.est.amelia.full),
+              Bxy.est.amelia.nok = mean(Bxy.est.amelia.nok),
+              Bxy.est.mecor = mean(Bxy.est.mecor),
+              Bxy.est.gmm = mean(Bxy.est.gmm)),
+           by=c("N","m")]
diff --git a/simulations/plot_example_2.R b/simulations/plot_example_2.R
new file mode 100644 (file)
index 0000000..d5ca2b6
--- /dev/null
@@ -0,0 +1,102 @@
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_2_simulation.feather"))
+
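+## Each block below computes, per (N, m) cell: coverage of the truth by the
+## 95% CI (p.true.in.ci), mean bias of the point estimate (estimate minus
+## truth, mean.bias), and how often the estimate is both significant and of
+## the correct sign (p.sign.correct).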
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                         bias = Bxy.est.naive - Bxy,
+                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='x',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+    
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy.est.naive - Bgy,
+                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+    
+
+
+x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                     bias = Bxy.est.amelia.full - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='x',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+                                     bias =  Bgy.est.amelia.full - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='g',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+
+
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+                                     bias =  Bxy.est.amelia.nok - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                     mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                  by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+                                     bias =  Bgy.est.amelia.nok - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                    by=c('N','m')]
+
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
+
+ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
diff --git a/simulations/plot_example_2_B.R b/simulations/plot_example_2_B.R
new file mode 100644 (file)
index 0000000..62c6848
--- /dev/null
@@ -0,0 +1,277 @@
+source("RemembR/R/RemembeR.R")
+library(arrow)
+library(data.table)
+library(ggplot2)
+library(filelock)
+library(argparser) # provides arg_parser(), add_argument(), parse_args()
+
+parser <- arg_parser("Aggregate simulation results into a plot dataset.")
+parser <- add_argument(parser, "--infile", default="example_2_B.feather", help="name of the file to read.")
+parser <- add_argument(parser, "--name", default="", help="The name to save the data under in the remember file.")
+args <- parse_args(parser)
+
+## take a shared (read) lock so we don't read while a simulation job is still appending
+l <- filelock::lock(paste0(args$infile, "_lock"), exclusive=FALSE)
+df <- data.table(read_feather(args$infile))
+filelock::unlock(l)
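+## example invocation (the --name value is up to the caller):
+## Rscript plot_example_2_B.R --infile example_2_B.feather --name plot.df.2B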
+
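+## build_plot_dataset() condenses one row per simulation run into one row per
+## (N, m, variable, method) cell with CI coverage, mean bias (estimate minus
+## truth), the mean and variance of the estimates, and sign-recovery rates.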
+build_plot_dataset <- function(df){
+    x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+    x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                             zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                             bias = Bxy.est.naive - Bxy,
+                             Bxy.est.naive = Bxy.est.naive,
+                             sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+    x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                               mean.bias = mean(bias),
+                               mean.est = mean(Bxy.est.naive),
+                               var.est = var(Bxy.est.naive),
+                               N.sims = .N,
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               variable='x',
+                               method='Naive'
+                               ),
+                            by=c('N','m')]
+    
+
+    g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+    g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                             zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                             bias = Bgy.est.naive - Bgy,
+                             Bgy.est.naive = Bgy.est.naive,
+                             sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+    g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                               mean.bias = mean(bias),
+                               mean.est = mean(Bgy.est.naive),
+                               var.est = var(Bgy.est.naive),
+                               N.sims = .N,
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               variable='g',
+                               method='Naive'
+                               ),
+                            by=c('N','m')]
+    
+
+    x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
+    x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
+                                   zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
+                                   bias = Bxy.est.feasible - Bxy,
+                                   Bxy.est.feasible = Bxy.est.feasible,
+                                   sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
+
+    x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
+                                     mean.bias = mean(bias),
+                                     mean.est = mean(Bxy.est.feasible),
+                                     var.est = var(Bxy.est.feasible),
+                                     N.sims = .N,
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='Feasible'
+                                     ),
+                                  by=c('N','m')]
+    
+
+    g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
+    g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
+                                   zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
+                                   bias = Bgy.est.feasible - Bgy,
+                                   Bgy.est.feasible = Bgy.est.feasible,
+                                   sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
+
+    g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
+                                     mean.bias = mean(bias),
+                                     mean.est = mean(Bgy.est.feasible),
+                                     var.est = var(Bgy.est.feasible),
+                                     N.sims = .N,
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Feasible'
+                                     ),
+                                  by=c('N','m')]
+    
+
+
+    x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+    x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                         zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                         bias = Bxy.est.amelia.full - Bxy.est.true,
+                                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+    x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                           mean.bias = mean(bias),
+                                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                           mean.est = mean(Bxy.est.amelia.full),
+                                           var.est = var(Bxy.est.amelia.full),
+                                           N.sims = .N,
+                                           variable='x',
+                                           method='Multiple imputation'
+                                           ),
+                                        by=c('N','m')]
+
+
+    g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+    g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+                                         zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+                                         bias =  Bgy.est.amelia.full - Bgy.est.true,
+                                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+    g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                           mean.bias = mean(bias),
+                                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                           mean.est = mean(Bgy.est.amelia.full),
+                                           var.est = var(Bgy.est.amelia.full),
+                                           N.sims = .N,
+                                           variable='g',
+                                           method='Multiple imputation'
+                                           ),
+                                        by=c('N','m')]
+
+    x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+    x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+                                       zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+                                       bias =  Bxy.est.amelia.nok - Bxy.est.true,
+                                       sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+    x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                         mean.bias = mean(bias),
+                                         p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                         mean.est = mean(Bxy.est.amelia.nok),
+                                         var.est = var(Bxy.est.amelia.nok),
+                                         N.sims = .N,
+                                         variable='x',
+                                         method='Multiple imputation (Classifier features unobserved)'
+                                         ),
+                                      by=c('N','m')]
+
+    g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+    g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+                                       zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+                                       bias =  Bgy.est.amelia.nok - Bgy.est.true,
+                                       sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+    g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                         mean.bias = mean(bias),
+                                         p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                         mean.est = mean(Bgy.est.amelia.nok),
+                                         var.est = var(Bgy.est.amelia.nok),
+                                         N.sims = .N,
+                                         variable='g',
+                                         method="Multiple imputation (Classifier features unobserved)"
+                                         ),
+                                      by=c('N','m')]
+
+
+    x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
+
+    x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
+                             zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
+                             bias =  Bxy.est.mecor - Bxy.est.true,
+                             sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
+
+    x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                               mean.bias = mean(bias),
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               mean.est = mean(Bxy.est.mecor),
+                               var.est = var(Bxy.est.mecor),
+                               N.sims = .N,
+                               variable='x',
+                               method='Regression Calibration'
+                               ),
+                            by=c('N','m')]
+
+    g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
+
+    g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
+                             zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
+                             bias =  Bgy.est.mecor - Bgy.est.true,
+                             sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+    g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                               mean.bias = mean(bias),
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               mean.est = mean(Bgy.est.mecor),
+                               var.est = var(Bgy.est.mecor),
+                               N.sims = .N,
+                               variable='g',
+                               method='Regression Calibration'
+                               ),
+                            by=c('N','m')]
+
+    ## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
+
+    ## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
+    ##                                      zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
+    ##                                      bias =  Bgy.est.mecor - Bgy.est.true,
+    ##                                      sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+    ## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+    ##                                        mean.bias = mean(bias),
+    ##                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+    ##                                      variable='g',
+    ##                                      method='Regression Calibration'
+    ##                                      ),
+    ##                                     by=c('N','m')]
+
+
+    x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
+    x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
+                         zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
+                         bias =  Bxy.est.gmm - Bxy.est.true,
+                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
+
+    x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           mean.est = mean(Bxy.est.gmm),
+                           var.est = var(Bxy.est.gmm),
+                           N.sims = .N,
+                           variable='x',
+                           method='2SLS+gmm'
+                           ),
+                        by=c('N','m')]
+
+    g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
+    g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
+                         zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
+                         bias =  Bgy.est.gmm - Bgy.est.true,
+                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
+
+    g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           mean.est = mean(Bgy.est.gmm),
+                           var.est = var(Bgy.est.gmm),
+                           N.sims = .N,
+                           variable='g',
+                           method='2SLS+gmm'
+                           ),
+                        by=c('N','m')]
+
+    accuracy <- df[,mean(accuracy)]
+
+    plot.df <- rbindlist(list(x.naive.plot, g.naive.plot, x.amelia.full.plot, g.amelia.full.plot,
+                              x.amelia.nok.plot, g.amelia.nok.plot, x.mecor.plot, g.mecor.plot,
+                              x.gmm.plot, g.gmm.plot, x.feasible.plot, g.feasible.plot),
+                         use.names=TRUE)
+    ## attach mean classifier accuracy to every cell
+    plot.df[, accuracy := accuracy]
+    return(plot.df)
+}
+
+plot.df <- build_plot_dataset(df)
+remember(plot.df, args$name)
+
+
+## df[gmm.ER_pval<0.05]
+
+
+
+## plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot, x.feasible.plot, g.feasible.plot),use.names=T)
+
+## plot.df[,accuracy := accuracy]
+
+## # plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]
+
+
+
+## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
+
+## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
+
+## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
diff --git a/simulations/plot_example_2_continuous.R b/simulations/plot_example_2_continuous.R
new file mode 100644 (file)
index 0000000..9b41076
--- /dev/null
@@ -0,0 +1,102 @@
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_2_simulation.feather"))
+
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                         bias = Bxy.est.naive - Bxy,
+                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='x',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+    
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy.est.naive - Bgy,
+                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+    
+
+
+x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                     bias = Bxy.est.amelia.full - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='x',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+                                     bias =  Bgy.est.amelia.full - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='g',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+
+
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+                                     bias =  Bxy.est.amelia.nok - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                     mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                  by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+                                     bias =  Bgy.est.amelia.nok - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                    by=c('N','m')]
+
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
+
+ggplot(plot.df,aes(y=N,x=m,color=mean.bias)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R
new file mode 100644 (file)
index 0000000..26916ab
--- /dev/null
@@ -0,0 +1,155 @@
+library(predictionError)
+library(mecor)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+library(Amelia)
+library(Zelig)
+
+logistic <- function(x) {1/(1+exp(-1*x))}
+
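+## Fit every estimator on one simulated dataset and append point estimates and
+## 95% CIs to the `result` list: the true model (error-free x), the feasible
+## model (only the m gold-standard labels x.obs), the naive model (treats the
+## classifier output w as if it were x), multiple imputation with and without
+## the classifier features k, a 2SLS/GMM estimator, and regression calibration
+## via mecor.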
+run_simulation <-  function(df, result){
+
+    accuracy <- df[,mean(w_pred==x)]
+    result <- append(result, list(accuracy=accuracy))
+
+    (model.true <- lm(y ~ x + g, data=df))
+    true.ci.Bxy <- confint(model.true)['x',]
+    true.ci.Bgy <- confint(model.true)['g',]
+
+    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bxy.ci.upper.true = true.ci.Bxy[2],
+                                  Bxy.ci.lower.true = true.ci.Bxy[1],
+                                  Bgy.ci.upper.true = true.ci.Bgy[2],
+                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  
+    (model.feasible <- lm(y~x.obs+g,data=df))
+
+    feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+    result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+                                  Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+                                  Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+    feasible.ci.Bgy <- confint(model.feasible)['g',]
+    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+    (model.naive <- lm(y~w+g, data=df))
+    
+    naive.ci.Bxy <- confint(model.naive)['w',]
+    naive.ci.Bgy <- confint(model.naive)['g',]
+
+    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+                                  Bgy.est.naive=coef(model.naive)['g'],
+                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
+                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
+                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
+                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  
+
+    ## multiple imputation when k is observed
+    ## amelia does great at this one.
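+    ## Amelia needs binary variables declared as nominal ("noms") so that it
+    ## imputes categories rather than continuous values.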
+    noms <- c()
+    if(length(unique(df$x.obs)) <=2){
+        noms <- c(noms, 'x.obs')
+    }
+
+    if(length(unique(df$g)) <=2){
+        noms <- c(noms, 'g')
+    }
+
+
+    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=noms)
+    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.full = est.x.mi,
+                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.full = est.g.mi,
+                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"), noms=noms)
+    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+    (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.nok = est.x.mi,
+                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.nok = est.g.mi,
+                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+                          ))
+
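+    ## Set up the 2SLS/GMM estimator: order rows by x.obs so labeled and
+    ## unlabeled rows are contiguous; v marks the validation (gold standard)
+    ## rows and p the primary (prediction-only) rows for predicted_covariates().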
+    N <- nrow(df)
+    m <- nrow(df[!is.na(x.obs)])
+    p <- v <- train <- rep(0,N)
+    M <- m
+    p[(M+1):(N)] <- 1
+    v[1:(M)] <- 1
+    df <- df[order(x.obs)]
+    y <- df[,y]
+    x <- df[,x.obs]
+    g <- df[,g]
+    w <- df[,w]
+    # gmm gets pretty close
+    (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))
+
+    result <- append(result,
+                     list(Bxy.est.gmm = gmm.res$beta[1,1],
+                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
+                          gmm.ER_pval = gmm.res$ER_pval
+                          ))
+
+    result <- append(result,
+                     list(Bgy.est.gmm = gmm.res$beta[2,1],
+                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+
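+    ## regression calibration with mecor: w is the error-prone measurement of
+    ## x, x.obs is the gold-standard reference, and B bootstrap replicates are
+    ## used for the confidence interval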
+    mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+    (mod.calibrated.mle)
+    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+    result <- append(result, list(
+                                 Bxy.est.mecor = mecor.ci['Estimate'],
+                                 Bxy.upper.mecor = mecor.ci['UCI'],
+                                 Bxy.lower.mecor = mecor.ci['LCI'])
+                     )
+
+    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+    result <- append(result, list(
+                                 Bgy.est.mecor = mecor.ci['Estimate'],
+                                 Bgy.upper.mecor = mecor.ci['UCI'],
+                                 Bgy.lower.mecor = mecor.ci['LCI'])
+                     )
+
+    ## clean up memory
+    rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))
+    
+    gc()
+    return(result)
+}
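+
+## A minimal, hypothetical sketch of how run_simulation() might be exercised on
+## toy data (not part of this commit; column names mirror the usage above):
+## library(data.table)
+## set.seed(1)
+## N <- 1000; m <- 100
+## x <- rbinom(N, 1, 0.5)
+## g <- rbinom(N, 1, 0.5)
+## y <- 0.2 * x - 0.2 * g + rnorm(N)
+## w_pred <- ifelse(runif(N) < 0.75, x, 1L - x)  # classifier correct ~75% of the time
+## df <- data.table(y = y, x = x, g = g, w = w_pred, w_pred = w_pred,
+##                  k = rnorm(N),                 # stand-in for a classifier feature
+##                  x.obs = c(x[1:m], rep(NA, N - m)))
+## result <- run_simulation(df, list(N = N, m = m, seed = 1))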
