Added, but didn't test the remaining robustness checks.

author Nathan TeBlunthuis <nathante@uw.edu>

Mon, 12 Dec 2022 06:46:30 +0000 (22:46 -0800)

committer Nathan TeBlunthuis <nathante@uw.edu>

Mon, 12 Dec 2022 06:46:30 +0000 (22:46 -0800)
author Nathan TeBlunthuis <nathante@uw.edu>
Mon, 12 Dec 2022 06:46:30 +0000 (22:46 -0800)
committer Nathan TeBlunthuis <nathante@uw.edu>
Mon, 12 Dec 2022 06:46:30 +0000 (22:46 -0800)
diff --git a/simulations/01_two_covariates.R b/simulations/01_two_covariates.R

index b8f9317352d5867851503c90b6d538227f829ad1..cd688c7d4b34d2456302299bc284cfedceb2c3f3 100644 (file)
--- a/simulations/01_two_covariates.R
+++ b/simulations/01_two_covariates.R
@@ -30,11 +30,11 @@ source("simulation_base.R")
  #### how much power do we get from the model in the first place? (sweeping N and m)
  #### 
  
-simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, seed=1){
+simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, Px=0.5, seed=1){
      set.seed(seed)
      z <- rnorm(N,sd=0.5)
                                          #    x.var.epsilon <- var(Bzx *z) * ((1-zx_explained_variance)/zx_explained_variance)
-    xprime <- Bzx * z #+ x.var.epsilon
+    xprime <- Bzx * z + qlogis(Px)
      x <- rbinom(N,1,plogis(xprime))
  
      y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bxy*x,Bzy*z)) * ((1-y_explained_variance)/y_explained_variance)
@@ -78,16 +78,18 @@ parser <- add_argument(parser, "--truth_formula", help='formula for the true var
  parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
  parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
  parser <- add_argument(parser, "--Bxy", help='Effect of x on y', default=0.3)
+parser <- add_argument(parser, "--Px", help='Base rate of x', default=0.5)
  
  args <- parse_args(parser)
  B0 <- 0
+Px <- args$Px
  Bxy <- args$Bxy
  Bzy <- args$Bzy
  Bzx <- args$Bzx
  
-df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, seed=args$seed + 500, y_explained_variance = args$y_explained_variance,  prediction_accuracy=args$prediction_accuracy)
+df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, Px, seed=args$seed + 500, y_explained_variance = args$y_explained_variance,  prediction_accuracy=args$prediction_accuracy)
  
-result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
+result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
  
  outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula))
      
diff --git a/simulations/02_indep_differential.R b/simulations/02_indep_differential.R

index 5d34312a46f250dfc05acef42cbcfd1b428245dc..80e19be78f8badbad9eb3e557ea6de5a356da384 100644 (file)
--- a/simulations/02_indep_differential.R
+++ b/simulations/02_indep_differential.R
@@ -31,11 +31,11 @@ source("simulation_base.R")
  
  ## one way to do it is by adding correlation to x.obs and y that isn't in w.
  ## in other words, the model is missing an important feature of x.obs that's related to y.
-simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,accuracy_imbalance_difference=0.3){
+simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,Px=0.5,accuracy_imbalance_difference=0.3){
      set.seed(seed)
      # make w and y dependent
      z <- rnorm(N,sd=0.5)
-    x <- rbinom(N, 1, plogis(Bzx * z))
+    x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
  
      y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bzy*z,Bxy*x)) * ((1-y_explained_variance)/y_explained_variance)
      y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
@@ -140,10 +140,12 @@ parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy va
  parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
  parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
  parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
+parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
  
  args <- parse_args(parser)
  
  B0 <- 0
+Px <- args$Px
  Bxy <- args$Bxy
  Bzy <- args$Bzy
  Bzx <- args$Bzx
@@ -157,7 +159,7 @@ if(args$m < args$N){
      ## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
      ## plot(pc.df)
  
-    result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=args$Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
+    result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, .'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
  
      outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula))
      
diff --git a/simulations/03_depvar.R b/simulations/03_depvar.R

index dde1beec9ec7800d3850947a77000b3ec8187f4f..f0064f2940d93257b6ecfa481db7fa2b4f3c47ff 100644 (file)
--- a/simulations/03_depvar.R
+++ b/simulations/03_depvar.R
@@ -76,12 +76,13 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
  parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.01)
  parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.01)
  parser <- add_argument(parser, "--Bzx", help='coeffficient of z on x', default=-0.5)
+parser <- add_argument(parser, "--B0", help='Base rate of y', default=0.5)
  parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
  parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y")
  
  args <- parse_args(parser)
  
-B0 <- 0
+B0 <- args$B0
  Bxy <- args$Bxy
  Bzy <- args$Bzy
  Bzx <- args$Bzx
diff --git a/simulations/03_depvar_differential.R b/simulations/03_depvar_differential.R

index 7b920ba6af6c7b6bfe7ed1cf962910767ab8e384..02944a51c0306d67c85a7cc79da2f903dc09a188 100644 (file)
--- a/simulations/03_depvar_differential.R
+++ b/simulations/03_depvar_differential.R
@@ -31,14 +31,14 @@ source("simulation_base.R")
  
  ## one way to do it is by adding correlation to x.obs and y that isn't in w.
  ## in other words, the model is missing an important feature of x.obs that's related to y.
-simulate_data <- function(N, m, B0, Bxy, Bzy, seed, prediction_accuracy=0.73, x_bias=-0.75){
+simulate_data <- function(N, m, B0, Bxy, Bzy, Py, seed, prediction_accuracy=0.73, x_bias=-0.75){
      set.seed(seed)
  
      # make w and y dependent
      z <- rbinom(N, 1, 0.5)
      x <- rbinom(N, 1, 0.5)
  
-    ystar <- Bzy * z + Bxy * x + B0
+    ystar <- Bzy * z + Bxy * x + B0 + qlogix(Py)
      y <- rbinom(N,1,plogis(ystar))
  
      # glm(y ~ x + z, family="binomial")
@@ -77,6 +77,7 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
  parser <- add_argument(parser, "--x_bias", help='how is the classifier biased?', default=0.75)
  parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.3)
  parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.3)
+parser <- add_argument(parser, "--Py", help='Base rate of y', default=0.5)
  parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
  parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*x")
  
diff --git a/simulations/04_depvar_differential.R b/simulations/04_depvar_differential.R

index df0e825d3d91c543bf8b257c6d6fe56fc65027e0..b3670807549249256da52cfc1fd0cb6ea8662b90 100644 (file)
--- a/simulations/04_depvar_differential.R
+++ b/simulations/04_depvar_differential.R
@@ -76,12 +76,13 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
  parser <- add_argument(parser, "--z_bias", help='how is the classifier biased?', default=1.5)
  parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.1)
  parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.1)
+parser <- add_argument(parser, "--B0", help='coeffficient of z on y', default=-0.1)
  parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
  parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y+z")
  
  args <- parse_args(parser)
  
-B0 <- 0
+B0 <- args$B0
  Bxy <- args$Bxy
  Bzy <- args$Bzy
  
diff --git a/simulations/Makefile b/simulations/Makefile

index e6a3bbe6d5823de6471110b81ed368a64fce2990..feeeaa54dbc1f311152a9f44c026e3fca49c54d7 100644 (file)
--- a/simulations/Makefile
+++ b/simulations/Makefile
@@ -148,21 +148,204 @@ robustness_1_dv.feather: robustness_1_dv_jobs
  robustness_1_dv.RDS: robustness_1_dv.feather
         rm -f $@
         ${srun} Rscript plot_dv_example.R --infile $< --name "robustness_1_dv" --outfile $@
- 
  
-robustness_2_jobs: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+
+robustness_2_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@
+
+robustness_2_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@
+
+robustness_2_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@
+
+robustness_2_jobs_p4: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
+
+START=0
+END_1=$(shell cat robustness_2_jobs_p1 | wc -l)
+END_2=$(shell cat robustness_2_jobs_p2 | wc -l)
+END_3=$(shell cat robustness_2_jobs_p3 | wc -l)
+END_4=$(shell cat robustness_2_jobs_p4 | wc -l)
+STEP=1000
+ONE=1
+ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
+ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
+ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
+ITEMS_4=$(shell seq $(START) $(STEP) $(END_4))
+
+robustness_2.feather: robustness_2_jobs_p1 robustness_2_jobs_p2 robustness_2_jobs_p3 robustness_2_jobs_p4
+       $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p1)
+       $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p2;)
+       $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p3;)
+       $(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p4;)
+
+
+robustness_2_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@
+
+robustness_2_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@
+
+robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@
+
+robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
+
+START=0
+END_1=$(shell cat robustness_2_dv_jobs_p1 | wc -l)
+END_2=$(shell cat robustness_2_dv_jobs_p2 | wc -l)
+END_3=$(shell cat robustness_2_dv_jobs_p3 | wc -l)
+END_4=$(shell cat robustness_2_dv_jobs_p4 | wc -l)
+STEP=1000
+ONE=1
+ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
+ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
+ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
+ITEMS_4=$(shell seq $(START) $(STEP) $(END_4))
+
+robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4
+       $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p1)
+       $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p2;)
+       $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p3;)
+       $(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p4;)
+
+
+
+robustness_3_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.5,0.6], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
+
+robustness_3_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.7,0.8], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
+
+robustness_3_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.9,0.95], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
+
+START=0
+END_1=$(shell cat robustness_3_jobs_p1 | wc -l)
+END_2=$(shell cat robustness_3_jobs_p2 | wc -l)
+END_3=$(shell cat robustness_3_jobs_p3 | wc -l)
+
+STEP=1000
+ONE=1
+ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
+ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
+ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
+
+robustness_3.feather: robustness_3_jobs_p1 robustness_3_jobs_p2 robustness_3_jobs_p3
+       $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p1)
+       $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p2;)
+       $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p3;)
+
+
+robustness_3_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
         rm -f $@
-       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.6,0.73,0.8,0.85,0.9,0.95]}' --outfile $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5,0.6], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
  
+robustness_3_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.7,0.8], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
  
+robustness_3_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.9,0.95], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
+
+START=0
+END_1=$(shell cat robustness_3_dv_jobs_p1 | wc -l)
+END_2=$(shell cat robustness_3_dv_jobs_p2 | wc -l)
+END_3=$(shell cat robustness_3_dv_jobs_p3 | wc -l)
  
-START=1
-END=$(shell cat robustness_2_jobs | wc -l)
  STEP=1000
-ITEMS=$(shell seq $(START) $(STEP) $(END))
+ONE=1
+ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
+ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
+ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
+
+robustness_3_dv.feather: robustness_3_dv_jobs_p1 robustness_3_dv_jobs_p2 robustness_3_dv_jobs_p3
+       $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p1)
+       $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p2;)
+       $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p3;)
+
  
-robustness_2.feather: robustness_2_jobs
-       $(foreach item,$(ITEMS),sbatch --wait --verbose --array=$(shell expr $(item))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 $<)
+
+robustness_4_jobs_p1: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 02_indep_differential.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],y_bias=[-1,-0.85]}' --outfile $@
+
+robustness_4_jobs_p2: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 02_indep_differential.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85], y_bias=[-0.70,-0.55]}' --outfile $@
+
+robustness_4_jobs_p3: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 02_indep_differential.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],y_bias=[-0.4,-0.25]}' --outfile $@
+
+robustness_4_jobs_p4: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 02_indep_differential.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],y_bias=[-0.1,0]}' --outfile $@
+
+START=0
+END_1=$(shell cat robustness_4_jobs_p1 | wc -l)
+END_2=$(shell cat robustness_4_jobs_p2 | wc -l)
+END_3=$(shell cat robustness_4_jobs_p3 | wc -l)
+END_4=$(shell cat robustness_4_jobs_p3 | wc -l)
+
+STEP=1000
+ONE=1
+ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
+ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
+ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
+ITEMS_4=$(shell seq $(START) $(STEP) $(END_4))
+
+robustness_4.feather: robustness_4_jobs_p1 robustness_4_jobs_p2 robustness_4_jobs_p3
+       $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p1)
+       $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p2;)
+       $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p3;)
+
+
+robustness_4_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5] "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0,0.1]}' --outfile $@
+
+robustness_4_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5] "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0.25,0.4]}' --outfile $@
+
+robustness_4_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0.55,0.7]}' --outfile $@
+robustness_4_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
+       rm -f $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0.85,1]}' --outfile $@
+
+
+START=0
+END_1=$(shell cat robustness_4_dv_jobs_p1 | wc -l)
+END_2=$(shell cat robustness_4_dv_jobs_p2 | wc -l)
+END_3=$(shell cat robustness_4_dv_jobs_p3 | wc -l)
+
+STEP=1000
+ONE=1
+ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
+ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
+ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
+
+robustness_4_dv.feather: robustness_4_dv_jobs_p1 robustness_4_dv_jobs_p2 robustness_4_dv_jobs_p3
+       $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p1)
+       $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p2;)
+       $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p3;)
  
  #      
  clean:
diff --git a/simulations/robustness_check_notes.md b/simulations/robustness_check_notes.md

index 0a287a71e3c6d8be79647c53112a4e40a44095eb..1c786e994f47dd36f596942d1e2d1f5e2e8157f1 100644 (file)
--- a/simulations/robustness_check_notes.md
+++ b/simulations/robustness_check_notes.md
@@ -12,3 +12,29 @@ Like `robustness\_1.RDS` but with a less precise model for $w_pred$.  In the mai
  
  This is just example 1 with varying levels of classifier accuracy. 
  
+# robustness_2_dv.RDS
+
+Example 3 with varying levels of classifier accuracy
+
+# robustness_3.RDS
+
+Example 1 with varying levels of skewness in the classified variable. The variable `Px` is the baserate of $X$ and controls the skewness of $X$.
+It probably makes more sense to report the mean of $X$ instead of `Px` in the supplement.
+
+# robustness_3_dv.RDS
+
+Example 3 with varying levels of skewness in the classified variable. The variable `B0` is the intercept of the main model and controls the skewness of $Y$.
+It probably makes more sense to report the mean of $Y$ instead of B0 in the supplement. 
+
+# robustness_4.RDS
+
+Example 2 with varying amounts of differential error. The variable `y_bias` controls the amount of differential error.
+It probably makes more sense to report the corrleation between $Y$ and $X-~$, or the difference in accuracy from when when $Y=1$ to $Y=0$ in the supplement instead of `y_bias`.
+
+# robustness_4_dv.RDS
+
+Example 4 with varying amounts of bias. The variable `z_bias` controls the amount of differential error.
+It probably makes more sense to report the corrleation between $Z$ and $Y-W$, or the difference in accuracy from when when $Z=1$ to $Z=0$ in the supplement instead of `z_bias`.
+
+
+
diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R

index 82b17a737ae05c9a98109da6164a9c75a10aebc3..08b11ec9595a49a34553b3b1eaaaa3fb1463e27f 100644 (file)
--- a/simulations/simulation_base.R
+++ b/simulations/simulation_base.R
@@ -151,10 +151,10 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu
      temp.df <- copy(df)
      temp.df[,y:=y.obs]
      mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula)
-    fisher.info <- solve(mod.caroll.lik$hessian)
+    fischer.info <- solve(mod.caroll.lik$hessian)
      coef <- mod.caroll.lik$par
-    ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
-    ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
+    ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
+    ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
      result <- append(result,
                       list(Bxy.est.mle = coef['x'],
                            Bxy.ci.upper.mle = ci.upper['x'],
@@ -299,11 +299,32 @@ run_simulation <-  function(df, result, outcome_formula=y~x+z, proxy_formula=NUL
          temp.df <- copy(df)
          temp.df <- temp.df[,x:=x.obs]
          mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
-        fisher.info <- solve(mod.caroll.lik$hessian)
-        coef <- mod.caroll.lik$par
-        ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
-        ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
-        
+
+    ## tryCatch({
+    ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient')
+    ## (mod.calibrated.mle)
+    ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+    ## result <- append(result, list(
+    ##                              Bxy.est.mecor = mecor.ci['Estimate'],
+    ##                              Bxy.ci.upper.mecor = mecor.ci['UCI'],
+    ##                              Bxy.ci.lower.mecor = mecor.ci['LCI'])
+    ##                  )
+
+
+
+    fischer.info <- NA
+    ci.upper <- NA
+    ci.lower <- NA
+
+    tryCatch({fischer.info <- solve(mod.caroll.lik$hessian)
+        ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
+        ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
+    },
+
+    error=function(e) {result[['error']] <- as.character(e)
+    })
+
+    coef <- mod.caroll.lik$par
          
          result <- append(result,
                           list(Bxy.est.mle = coef['x'],
author	Nathan TeBlunthuis <nathante@uw.edu>
	Mon, 12 Dec 2022 06:46:30 +0000 (22:46 -0800)
committer	Nathan TeBlunthuis <nathante@uw.edu>
	Mon, 12 Dec 2022 06:46:30 +0000 (22:46 -0800)
simulations/01_two_covariates.R		patch \| blob \| history
simulations/02_indep_differential.R		patch \| blob \| history
simulations/03_depvar.R		patch \| blob \| history
simulations/03_depvar_differential.R		patch \| blob \| history
simulations/04_depvar_differential.R		patch \| blob \| history
simulations/Makefile		patch \| blob \| history
simulations/robustness_check_notes.md		patch \| blob \| history
simulations/simulation_base.R		patch \| blob \| history