X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/e41d11afb9a80180feff844666e3ee463d20a7cd..f8f58301e0285118f7b669a96ed9367a9914ba02:/simulations/example_3.R

diff --git a/simulations/example_3.R b/simulations/example_3.R
deleted file mode 100644
index 389dba1..0000000
--- a/simulations/example_3.R
+++ /dev/null
@@ -1,144 +0,0 @@
-### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate
-### What kind of data invalidates fong + tyler?
-### This is the same as example 2, only instead of x->k we have k->x.
-### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
-### Even when you include the proxy variable in the regression.
-### But with some ground truth and multiple imputation, you can fix it.
-
-library(argparser)
-library(mecor)
-library(ggplot2)
-library(data.table)
-library(filelock)
-library(arrow)
-library(Amelia)
-library(Zelig)
-library(predictionError)
-options(amelia.parallel="no",
-        amelia.ncpus=1)
-setDTthreads(40)
-
-source("simulation_base.R")
-
-## SETUP:
-### we want to estimate x -> y; x is MAR
-### we have x -> k; k -> w; x -> w is used to predict x via the model w.
-### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
-### The labels x are binary, but the model provides a continuous predictor
-
-### simulation:
-#### how much power do we get from the model in the first place? (sweeping N and m)
-#### 
-
-## one way to do it is by adding correlation to x.obs and y that isn't in w.
-## in other words, the model is missing an important feature of x.obs that's related to y.
-simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-    set.seed(seed)
-
-    ## the true value of x
-
-    g <- rbinom(N, 1, 0.5)
-
-    # make w and y dependent
-    u <- rnorm(N,0,Bxy)
-
-    xprime <- Bgx * g + rnorm(N,0,1) 
-
-    k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u
-
-    x <- as.integer(logistic(scale(xprime)) > 0.5)
-
-    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
-
-    df <- data.table(x=x,k=k,y=y,g=g)
-
-    w.model <- glm(x ~ k,df, family=binomial(link='logit')) 
-
-    if( m < N){
-        df <- df[sample(nrow(df), m), x.obs := x]
-    } else {
-        df <- df[, x.obs := x]
-    }
-
-    df[, x.obs := x.obs]
-
-    w <- predict(w.model, df) + rnorm(N, 0, 1)
-    ## y = B0 + B1x + e
-
-    df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
-    return(df)
-}
-
-## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-##     set.seed(seed)
-
-##     ## the true value of x
-
-##     g <- rbinom(N, 1, 0.5)
-
-##     # make w and y dependent
-##     u <- rnorm(N,0,5)
-
-##     xprime <- Bgx * g + rnorm(N,0,1)
-
-##     k <- Bkx*xprime + rnorm(N,0,3)
-
-##     x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
-
-##     y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
-
-##     df <- data.table(x=x,k=k,y=y,g=g)
-
-##     w.model <- glm(x ~ k, df, family=binomial(link='logit')) 
-
-##     if( m < N){
-##         df <- df[sample(nrow(df), m), x.obs := x]
-##     } else {
-##         df <- df[, x.obs := x]
-##     }
-
-##     w <- predict(w.model,data.frame(k=k)) + u 
-##     ## y = B0 + B1x + e
-
-##     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
-##     return(df)
-## }
-
-
-schennach <- function(df){
-
-    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
-    df[,xstar_pred := predict(fwx, df)]
-    gxt <- lm(y ~ xstar_pred+g, df)
-
-}
-
-
-parser <- arg_parser("Simulate data and fit corrected models")
-parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
-parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
-parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
-parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
-args <- parse_args(parser)
-
-B0 <- 0
-Bxy <- 0.2
-Bgy <- 0
-Bkx <- 2
-Bgx <- 0
-
-
-outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
-                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
-
-outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
-if(file.exists(args$outfile)){
-    logdata <- read_feather(args$outfile)
-    logdata <- rbind(logdata,as.data.table(outline))
-} else {
-    logdata <- as.data.table(outline)
-}
-
-print(outline)
-write_feather(logdata, args$outfile)
-unlock(outfile_lock)