+++ /dev/null
-### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign error in a covariate's estimated effect.
-### What kind of data invalidates Fong and Tyler's approach?
-### This is the same as example 2, only instead of x -> k we have k -> x.
-### Even with a good predictor, if it is biased with respect to a covariate you can get the wrong sign,
-### even when you include the proxy variable in the regression.
-### But with some ground-truth labels and multiple imputation, you can fix it.
-
-library(argparser)
-library(mecor)
-library(ggplot2)
-library(data.table)
-library(filelock)
-library(arrow)
-library(Amelia)
-library(Zelig)
-library(predictionError)
-options(amelia.parallel="no",
-        amelia.ncpus=1)
-setDTthreads(40)
-
-source("simulation_base.R")
-
-## SETUP:
-### we want to estimate x -> y; x is missing at random (MAR)
-### we have x -> k and k -> w: the model uses k to produce w, a noisy prediction of x
-### A realistic scenario: an NLP model predicting something like "racial harassment" in social media comments
-### The labels x are binary, but the model provides a continuous predictor
-
-### simulation:
-#### how much power do we get from the model in the first place? (sweeping N and m)
-####
-
-## One way to do this is to add correlation between x.obs and y that is not captured by w;
-## in other words, the model is missing an important feature of x.obs that is related to y.
-simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-    set.seed(seed)
-
-    ## binary covariate
-    g <- rbinom(N, 1, 0.5)
-
-    ## unobserved confounder that makes w (via k) and y dependent
-    u <- rnorm(N, 0, Bxy)
-
-    ## latent continuous score underlying the true value of x
-    xprime <- Bgx * g + rnorm(N, 0, 1)
-
-    ## k: the feature the proxy model sees; it also picks up the confounder u
-    k <- Bkx*xprime + rnorm(N, 0, 1.5) + 1.1*Bkx*u
-
-    ## the true (binary) value of x
-    x <- as.integer(logistic(scale(xprime)) > 0.5)
-
-    ## the outcome: y = B0 + Bxy*x + Bgy*g + u + e
-    y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
-
-    df <- data.table(x=x, k=k, y=y, g=g)
-
-    ## the proxy model: predict x from k
-    w.model <- glm(x ~ k, data=df, family=binomial(link='logit'))
-
-    ## only m of the N observations have ground-truth labels; the rest are left missing
-    if(m < N){
-        df <- df[sample(nrow(df), m), x.obs := x]
-    } else {
-        df <- df[, x.obs := x]
-    }
-
-    ## w: the model's prediction plus noise (note: predict() returns the link / log-odds
-    ## scale here, and the 0.5 threshold below is applied on that scale)
-    w <- predict(w.model, df) + rnorm(N, 0, 1)
-
-    df[, ':='(w=w, w_pred=as.integer(w > 0.5), u=u)]
-    return(df)
-}
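-
-## A minimal sketch (not part of the original pipeline; run_simulation() in simulation_base.R
-## presumably performs the real analysis) contrasting the infeasible regression on the true
-## labels with the feasible regressions that substitute the model's prediction, so the sign of
-## each coefficient can be compared across specifications. The function name is hypothetical.
-compare_specifications <- function(df){
-    truth <- lm(y ~ x + g, data=df)        # infeasible: uses the true labels
-    naive <- lm(y ~ w_pred + g, data=df)   # plug in the thresholded prediction
-    proxy <- lm(y ~ w + g, data=df)        # include the continuous proxy directly
-    list(truth=coef(truth), naive=coef(naive), proxy=coef(proxy))
-}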
-
-## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-## set.seed(seed)
-
-## ## the true value of x
-
-## g <- rbinom(N, 1, 0.5)
-
-## # make w and y dependent
-## u <- rnorm(N,0,5)
-
-## xprime <- Bgx * g + rnorm(N,0,1)
-
-## k <- Bkx*xprime + rnorm(N,0,3)
-
-## x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
-
-## y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
-
-## df <- data.table(x=x,k=k,y=y,g=g)
-
-## w.model <- glm(x ~ k, df, family=binomial(link='logit'))
-
-## if( m < N){
-## df <- df[sample(nrow(df), m), x.obs := x]
-## } else {
-## df <- df[, x.obs := x]
-## }
-
-## w <- predict(w.model,data.frame(k=k)) + u
-## ## y = B0 + B1x + e
-
-## df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
-## return(df)
-## }
-
-
-## two-stage correction in the spirit of Schennach: regress the observed ground-truth labels
-## on the proxy w, then use the fitted values in place of x in the outcome model
-schennach <- function(df){
-
-    fwx <- glm(x.obs ~ w, data=df, family=binomial(link='logit'))
-    df[, xstar_pred := predict(fwx, df)]
-    gxt <- lm(y ~ xstar_pred + g, data=df)
-    return(gxt)
-}
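-
-## A minimal sketch of the multiple-imputation correction mentioned in the header comment
-## (not the version the pipeline uses; that presumably lives in simulation_base.R): impute the
-## missing ground-truth labels with Amelia, fit the outcome model on each completed data set,
-## and pool the estimates with Rubin's rules. The function name and the choice of variables
-## passed to the imputation model are assumptions for illustration.
-amelia_mi_sketch <- function(df, n_imp=5){
-    impute_vars <- as.data.frame(df[, .(y, g, w, x.obs)])
-    ## x.obs and g are categorical, so declare them nominal; p2s=0 silences Amelia's output
-    a.out <- amelia(impute_vars, m=n_imp, noms=c('x.obs', 'g'), p2s=0)
-
-    ## fit the outcome model within each imputed data set; the x.obs term is the 2nd coefficient
-    fits <- lapply(a.out$imputations, function(imp) lm(y ~ x.obs + g, data=imp))
-    ests <- sapply(fits, function(fit) coef(fit)[2])
-    vars <- sapply(fits, function(fit) vcov(fit)[2, 2])
-
-    ## Rubin's rules: total variance = within-imputation + (1 + 1/m) * between-imputation
-    list(est=mean(ests), se=sqrt(mean(vars) + (1 + 1/n_imp)*var(ests)))
-}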
-
-
-parser <- arg_parser("Simulate data and fit corrected models")
-parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
-parser <- add_argument(parser, "--m", default=200, help="the number of ground-truth observations of x")
-parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
-parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
-args <- parse_args(parser)
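-
-## Example invocation (the script's file name is a placeholder; the flags match the parser above):
-##   Rscript example_2_b.R --N 5000 --m 200 --seed 432 --outfile example_2.feather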
-
-## true parameters of the data-generating process
-B0 <- 0    # intercept
-Bxy <- 0.2 # true effect of x on y
-Bgy <- 0   # true effect of g on y
-Bkx <- 2   # effect of x on the proxy input k
-Bgx <- 0   # effect of g on x
-
-
-## simulate one data set and fit the corrected models (implemented in simulation_base.R)
-outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed),
-                          list('N'=args$N, 'm'=args$m, 'B0'=B0, 'Bxy'=Bxy, 'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
-
-## append this run's results to the shared output file, serializing concurrent writers with a file lock
-outfile_lock <- lock(paste0(args$outfile, '_lock'), exclusive=TRUE)
-if(file.exists(args$outfile)){
- logdata <- read_feather(args$outfile)
- logdata <- rbind(logdata,as.data.table(outline))
-} else {
- logdata <- as.data.table(outline)
-}
-
-print(outline)
-write_feather(logdata, args$outfile)
-unlock(outfile_lock)