simulations/example_3.R

### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign error (a "type S" error) in the estimate for a covariate
   2 ### What kind of data invalidates fong + tyler?
   3 ### This is the same as example 2, only instead of x->k we have k->x.
### Even with a good predictor, if its errors are biased with respect to a covariate you can get the wrong sign,
### and this can happen even when the proxy variable is included in the regression.
### With some ground-truth labels and multiple imputation, however, you can fix it.
   7
   8 library(argparser)
   9 library(mecor)
  10 library(ggplot2)
  11 library(data.table)
  12 library(filelock)
  13 library(arrow)
  14 library(Amelia)
  15 library(Zelig)
  16 library(predictionError)
  17 options(amelia.parallel="no",
  18         amelia.ncpus=1)
  19 setDTthreads(40)
  20
  21 source("simulation_base.R")
  22
  23 ## SETUP:
  24 ### we want to estimate x -> y; x is MAR
  25 ### we have x -> k; k -> w; x -> w is used to predict x via the model w.
  26 ### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
  27 ### The labels x are binary, but the model provides a continuous predictor
  28
  29 ### simulation:
  30 #### how much power do we get from the model in the first place? (sweeping N and m)
  31 ####
  32
  33 ## one way to do it is by adding correlation to x.obs and y that isn't in w.
  34 ## in other words, the model is missing an important feature of x.obs that's related to y.
  35 simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
  36     set.seed(seed)
  37
  38     ## the true value of x
  39
  40     g <- rbinom(N, 1, 0.5)
  41
  42     # make w and y dependent
  43     u <- rnorm(N,0,Bxy)
  44
  45     xprime <- Bgx * g + rnorm(N,0,1)
  46
  47     k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u
  48
  49     x <- as.integer(logistic(scale(xprime)) > 0.5)
  50
  51     y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
  52
  53     df <- data.table(x=x,k=k,y=y,g=g)
  54
  55     w.model <- glm(x ~ k,df, family=binomial(link='logit'))
  56
  57     if( m < N){
  58         df <- df[sample(nrow(df), m), x.obs := x]
  59     } else {
  60         df <- df[, x.obs := x]
  61     }
  62
  63     df[, x.obs := x.obs]
  64
  65     w <- predict(w.model, df) + rnorm(N, 0, 1)
  66     ## y = B0 + B1x + e
  67
  68     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
  69     return(df)
  70 }
  71
  72 ## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
  73 ##     set.seed(seed)
  74
  75 ##     ## the true value of x
  76
  77 ##     g <- rbinom(N, 1, 0.5)
  78
  79 ##     # make w and y dependent
  80 ##     u <- rnorm(N,0,5)
  81
  82 ##     xprime <- Bgx * g + rnorm(N,0,1)
  83
  84 ##     k <- Bkx*xprime + rnorm(N,0,3)
  85
  86 ##     x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
  87
  88 ##     y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
  89
  90 ##     df <- data.table(x=x,k=k,y=y,g=g)
  91
  92 ##     w.model <- glm(x ~ k, df, family=binomial(link='logit'))
  93
  94 ##     if( m < N){
  95 ##         df <- df[sample(nrow(df), m), x.obs := x]
  96 ##     } else {
  97 ##         df <- df[, x.obs := x]
  98 ##     }
  99
 100 ##     w <- predict(w.model,data.frame(k=k)) + u
 101 ##     ## y = B0 + B1x + e
 102
 103 ##     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
 104 ##     return(df)
 105 ## }
 106
 107
 108 schennach <- function(df){
 109
 110     fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
 111     df[,xstar_pred := predict(fwx, df)]
 112     gxt <- lm(y ~ xstar_pred+g, df)
 113
 114 }
 115
 116
 117 parser <- arg_parser("Simulate data and fit corrected models")
 118 parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
 119 parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
 120 parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
 121 parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
 122 args <- parse_args(parser)
 123
 124 B0 <- 0
 125 Bxy <- 0.2
 126 Bgy <- 0
 127 Bkx <- 2
 128 Bgx <- 0
 129
 130
 131 outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
 132                          ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
 133
 134 outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
 135 if(file.exists(args$outfile)){
 136     logdata <- read_feather(args$outfile)
 137     logdata <- rbind(logdata,as.data.table(outline))
 138 } else {
 139     logdata <- as.data.table(outline)
 140 }
 141
 142 print(outline)
 143 write_feather(logdata, args$outfile)
 144 unlock(outfile_lock)