X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/e41d11afb9a80180feff844666e3ee463d20a7cd..f8f58301e0285118f7b669a96ed9367a9914ba02:/simulations/example_3.R diff --git a/simulations/example_3.R b/simulations/example_3.R deleted file mode 100644 index 389dba1..0000000 --- a/simulations/example_3.R +++ /dev/null @@ -1,144 +0,0 @@ -### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate -### What kind of data invalidates fong + tyler? -### This is the same as example 2, only instead of x->k we have k->x. -### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. -### Even when you include the proxy variable in the regression. -### But with some ground truth and multiple imputation, you can fix it. - -library(argparser) -library(mecor) -library(ggplot2) -library(data.table) -library(filelock) -library(arrow) -library(Amelia) -library(Zelig) -library(predictionError) -options(amelia.parallel="no", - amelia.ncpus=1) -setDTthreads(40) - -source("simulation_base.R") - -## SETUP: -### we want to estimate x -> y; x is MAR -### we have x -> k; k -> w; x -> w is used to predict x via the model w. -### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments -### The labels x are binary, but the model provides a continuous predictor - -### simulation: -#### how much power do we get from the model in the first place? (sweeping N and m) -#### - -## one way to do it is by adding correlation to x.obs and y that isn't in w. -## in other words, the model is missing an important feature of x.obs that's related to y. -simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ - set.seed(seed) - - ## the true value of x - - g <- rbinom(N, 1, 0.5) - - # make w and y dependent - u <- rnorm(N,0,Bxy) - - xprime <- Bgx * g + rnorm(N,0,1) - - k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u - - x <- as.integer(logistic(scale(xprime)) > 0.5) - - y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u - - df <- data.table(x=x,k=k,y=y,g=g) - - w.model <- glm(x ~ k,df, family=binomial(link='logit')) - - if( m < N){ - df <- df[sample(nrow(df), m), x.obs := x] - } else { - df <- df[, x.obs := x] - } - - df[, x.obs := x.obs] - - w <- predict(w.model, df) + rnorm(N, 0, 1) - ## y = B0 + B1x + e - - df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)] - return(df) -} - -## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ -## set.seed(seed) - -## ## the true value of x - -## g <- rbinom(N, 1, 0.5) - -## # make w and y dependent -## u <- rnorm(N,0,5) - -## xprime <- Bgx * g + rnorm(N,0,1) - -## k <- Bkx*xprime + rnorm(N,0,3) - -## x <- as.integer(logistic(scale(xprime+0.3)) > 0.5) - -## y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u - -## df <- data.table(x=x,k=k,y=y,g=g) - -## w.model <- glm(x ~ k, df, family=binomial(link='logit')) - -## if( m < N){ -## df <- df[sample(nrow(df), m), x.obs := x] -## } else { -## df <- df[, x.obs := x] -## } - -## w <- predict(w.model,data.frame(k=k)) + u -## ## y = B0 + B1x + e - -## df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)] -## return(df) -## } - - -schennach <- function(df){ - - fwx <- glm(x.obs~w, df, family=binomial(link='logit')) - df[,xstar_pred := predict(fwx, df)] - gxt <- lm(y ~ xstar_pred+g, df) - -} - - -parser <- arg_parser("Simulate data and fit corrected models") -parser <- add_argument(parser, "--N", default=5000, help="number of observations of w") -parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations") -parser <- add_argument(parser, "--seed", default=432, help='seed for the rng') -parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather') -args <- parse_args(parser) - -B0 <- 0 -Bxy <- 0.2 -Bgy <- 0 -Bkx <- 2 -Bgx <- 0 - - -outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) - ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed)) - -outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE) -if(file.exists(args$outfile)){ - logdata <- read_feather(args$outfile) - logdata <- rbind(logdata,as.data.table(outline)) -} else { - logdata <- as.data.table(outline) -} - -print(outline) -write_feather(logdata, args$outfile) -unlock(outfile_lock)