make first simulation with precise accuracies and explained variances

[ml_measurement_error_public.git] / simulations / 02_indep_differential.R
diff --git a/simulations/example_3.R b/simulations/02_indep_differential.R

similarity index 70%

rename from simulations/example_3.R

rename to simulations/02_indep_differential.R

index 389dba1c1a8536f2cf03f6539a36452ecc253478..5a7784b65a03ef307be991c1f51a81f631f8e743 100644 (file)
--- a/simulations/example_3.R
+++ b/simulations/02_indep_differential.R
@@ -1,6 +1,5 @@
-### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate
+### EXAMPLE 1: demonstrates how measurement error can lead to a type sign error in a covariate
  ### What kind of data invalidates fong + tyler?
-### This is the same as example 2, only instead of x->k we have k->x.
  ### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
  ### Even when you include the proxy variable in the regression.
  ### But with some ground truth and multiple imputation, you can fix it.
@@ -32,7 +31,7 @@ source("simulation_base.R")
  
  ## one way to do it is by adding correlation to x.obs and y that isn't in w.
  ## in other words, the model is missing an important feature of x.obs that's related to y.
-simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+simulate_data <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed, xy.explained.variance=0.01, u.explained.variance=0.1){
      set.seed(seed)
  
      ## the true value of x
@@ -40,7 +39,7 @@ simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
      g <- rbinom(N, 1, 0.5)
  
      # make w and y dependent
-    u <- rnorm(N,0,Bxy)
+    u <- rnorm(N,0,)
  
      xprime <- Bgx * g + rnorm(N,0,1) 
  
@@ -48,7 +47,7 @@ simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
  
      x <- as.integer(logistic(scale(xprime)) > 0.5)
  
-    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
+    y <-  Bxy * x  + Bgy * g + B0 + u + rnorm(N, 0, 1)
  
      df <- data.table(x=x,k=k,y=y,g=g)
  
@@ -69,42 +68,6 @@ simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
      return(df)
  }
  
-## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-##     set.seed(seed)
-
-##     ## the true value of x
-
-##     g <- rbinom(N, 1, 0.5)
-
-##     # make w and y dependent
-##     u <- rnorm(N,0,5)
-
-##     xprime <- Bgx * g + rnorm(N,0,1)
-
-##     k <- Bkx*xprime + rnorm(N,0,3)
-
-##     x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
-
-##     y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0 + u
-
-##     df <- data.table(x=x,k=k,y=y,g=g)
-
-##     w.model <- glm(x ~ k, df, family=binomial(link='logit')) 
-
-##     if( m < N){
-##         df <- df[sample(nrow(df), m), x.obs := x]
-##     } else {
-##         df <- df[, x.obs := x]
-##     }
-
-##     w <- predict(w.model,data.frame(k=k)) + u 
-##     ## y = B0 + B1x + e
-
-##     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
-##     return(df)
-## }
-
-
  schennach <- function(df){
  
      fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
@@ -128,7 +91,7 @@ Bkx <- 2
  Bgx <- 0
  
  
-outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+outline <- run_simulation(simulate_data(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
                           ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
  
  outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)