From: Nathan TeBlunthuis Date: Mon, 12 Dec 2022 06:46:30 +0000 (-0800) Subject: Added, but didn't test the remaining robustness checks. X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/commitdiff_plain/d8bc08f18f8c2128369ee959196e0e6080a11689?ds=inline;hp=8ac33c14d7e7874bf283aa9c252fa06566dc8b15 Added, but didn't test the remaining robustness checks. --- diff --git a/simulations/01_two_covariates.R b/simulations/01_two_covariates.R index b8f9317..cd688c7 100644 --- a/simulations/01_two_covariates.R +++ b/simulations/01_two_covariates.R @@ -30,11 +30,11 @@ source("simulation_base.R") #### how much power do we get from the model in the first place? (sweeping N and m) #### -simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, seed=1){ +simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, Px=0.5, seed=1){ set.seed(seed) z <- rnorm(N,sd=0.5) # x.var.epsilon <- var(Bzx *z) * ((1-zx_explained_variance)/zx_explained_variance) - xprime <- Bzx * z #+ x.var.epsilon + xprime <- Bzx * z + qlogis(Px) x <- rbinom(N,1,plogis(xprime)) y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bxy*x,Bzy*z)) * ((1-y_explained_variance)/y_explained_variance) @@ -78,16 +78,18 @@ parser <- add_argument(parser, "--truth_formula", help='formula for the true var parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3) parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3) parser <- add_argument(parser, "--Bxy", help='Effect of x on y', default=0.3) +parser <- add_argument(parser, "--Px", help='Base rate of x', default=0.5) args <- parse_args(parser) B0 <- 0 +Px <- args$Px Bxy <- args$Bxy Bzy <- args$Bzy Bzx <- args$Bzx -df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, seed=args$seed + 500, y_explained_variance = args$y_explained_variance, prediction_accuracy=args$prediction_accuracy) +df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, Px, seed=args$seed + 500, y_explained_variance = args$y_explained_variance, prediction_accuracy=args$prediction_accuracy) -result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='') +result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='') outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula)) diff --git a/simulations/02_indep_differential.R b/simulations/02_indep_differential.R index 5d34312..80e19be 100644 --- a/simulations/02_indep_differential.R +++ b/simulations/02_indep_differential.R @@ -31,11 +31,11 @@ source("simulation_base.R") ## one way to do it is by adding correlation to x.obs and y that isn't in w. ## in other words, the model is missing an important feature of x.obs that's related to y. -simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,accuracy_imbalance_difference=0.3){ +simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,Px=0.5,accuracy_imbalance_difference=0.3){ set.seed(seed) # make w and y dependent z <- rnorm(N,sd=0.5) - x <- rbinom(N, 1, plogis(Bzx * z)) + x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px))) y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bzy*z,Bxy*x)) * ((1-y_explained_variance)/y_explained_variance) y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon)) @@ -140,10 +140,12 @@ parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy va parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5) parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0) parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z") +parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5) args <- parse_args(parser) B0 <- 0 +Px <- args$Px Bxy <- args$Bxy Bzy <- args$Bzy Bzx <- args$Bzx @@ -157,7 +159,7 @@ if(args$m < args$N){ ## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05) ## plot(pc.df) - result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=args$Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='') + result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, .'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='') outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula)) diff --git a/simulations/03_depvar.R b/simulations/03_depvar.R index dde1bee..f0064f2 100644 --- a/simulations/03_depvar.R +++ b/simulations/03_depvar.R @@ -76,12 +76,13 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.01) parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.01) parser <- add_argument(parser, "--Bzx", help='coeffficient of z on x', default=-0.5) +parser <- add_argument(parser, "--B0", help='Base rate of y', default=0.5) parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z") parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y") args <- parse_args(parser) -B0 <- 0 +B0 <- args$B0 Bxy <- args$Bxy Bzy <- args$Bzy Bzx <- args$Bzx diff --git a/simulations/03_depvar_differential.R b/simulations/03_depvar_differential.R index 7b920ba..02944a5 100644 --- a/simulations/03_depvar_differential.R +++ b/simulations/03_depvar_differential.R @@ -31,14 +31,14 @@ source("simulation_base.R") ## one way to do it is by adding correlation to x.obs and y that isn't in w. ## in other words, the model is missing an important feature of x.obs that's related to y. -simulate_data <- function(N, m, B0, Bxy, Bzy, seed, prediction_accuracy=0.73, x_bias=-0.75){ +simulate_data <- function(N, m, B0, Bxy, Bzy, Py, seed, prediction_accuracy=0.73, x_bias=-0.75){ set.seed(seed) # make w and y dependent z <- rbinom(N, 1, 0.5) x <- rbinom(N, 1, 0.5) - ystar <- Bzy * z + Bxy * x + B0 + ystar <- Bzy * z + Bxy * x + B0 + qlogix(Py) y <- rbinom(N,1,plogis(ystar)) # glm(y ~ x + z, family="binomial") @@ -77,6 +77,7 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th parser <- add_argument(parser, "--x_bias", help='how is the classifier biased?', default=0.75) parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.3) parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.3) +parser <- add_argument(parser, "--Py", help='Base rate of y', default=0.5) parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z") parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*x") diff --git a/simulations/04_depvar_differential.R b/simulations/04_depvar_differential.R index df0e825..b367080 100644 --- a/simulations/04_depvar_differential.R +++ b/simulations/04_depvar_differential.R @@ -76,12 +76,13 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th parser <- add_argument(parser, "--z_bias", help='how is the classifier biased?', default=1.5) parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.1) parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.1) +parser <- add_argument(parser, "--B0", help='coeffficient of z on y', default=-0.1) parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z") parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y+z") args <- parse_args(parser) -B0 <- 0 +B0 <- args$B0 Bxy <- args$Bxy Bzy <- args$Bzy diff --git a/simulations/Makefile b/simulations/Makefile index e6a3bbe..feeeaa5 100644 --- a/simulations/Makefile +++ b/simulations/Makefile @@ -148,21 +148,204 @@ robustness_1_dv.feather: robustness_1_dv_jobs robustness_1_dv.RDS: robustness_1_dv.feather rm -f $@ ${srun} Rscript plot_dv_example.R --infile $< --name "robustness_1_dv" --outfile $@ - -robustness_2_jobs: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + +robustness_2_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@ + +robustness_2_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@ + +robustness_2_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@ + +robustness_2_jobs_p4: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@ + +START=0 +END_1=$(shell cat robustness_2_jobs_p1 | wc -l) +END_2=$(shell cat robustness_2_jobs_p2 | wc -l) +END_3=$(shell cat robustness_2_jobs_p3 | wc -l) +END_4=$(shell cat robustness_2_jobs_p4 | wc -l) +STEP=1000 +ONE=1 +ITEMS_1=$(shell seq $(START) $(STEP) $(END_1)) +ITEMS_2=$(shell seq $(START) $(STEP) $(END_2)) +ITEMS_3=$(shell seq $(START) $(STEP) $(END_3)) +ITEMS_4=$(shell seq $(START) $(STEP) $(END_4)) + +robustness_2.feather: robustness_2_jobs_p1 robustness_2_jobs_p2 robustness_2_jobs_p3 robustness_2_jobs_p4 + $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p1) + $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p2;) + $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p3;) + $(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p4;) + + +robustness_2_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@ + +robustness_2_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@ + +robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@ + +robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@ + +START=0 +END_1=$(shell cat robustness_2_dv_jobs_p1 | wc -l) +END_2=$(shell cat robustness_2_dv_jobs_p2 | wc -l) +END_3=$(shell cat robustness_2_dv_jobs_p3 | wc -l) +END_4=$(shell cat robustness_2_dv_jobs_p4 | wc -l) +STEP=1000 +ONE=1 +ITEMS_1=$(shell seq $(START) $(STEP) $(END_1)) +ITEMS_2=$(shell seq $(START) $(STEP) $(END_2)) +ITEMS_3=$(shell seq $(START) $(STEP) $(END_3)) +ITEMS_4=$(shell seq $(START) $(STEP) $(END_4)) + +robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4 + $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p1) + $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p2;) + $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p3;) + $(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p4;) + + + +robustness_3_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.5,0.6], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@ + +robustness_3_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.7,0.8], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@ + +robustness_3_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.9,0.95], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@ + +START=0 +END_1=$(shell cat robustness_3_jobs_p1 | wc -l) +END_2=$(shell cat robustness_3_jobs_p2 | wc -l) +END_3=$(shell cat robustness_3_jobs_p3 | wc -l) + +STEP=1000 +ONE=1 +ITEMS_1=$(shell seq $(START) $(STEP) $(END_1)) +ITEMS_2=$(shell seq $(START) $(STEP) $(END_2)) +ITEMS_3=$(shell seq $(START) $(STEP) $(END_3)) + +robustness_3.feather: robustness_3_jobs_p1 robustness_3_jobs_p2 robustness_3_jobs_p3 + $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p1) + $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p2;) + $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p3;) + + +robustness_3_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py rm -f $@ - ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.6,0.73,0.8,0.85,0.9,0.95]}' --outfile $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5,0.6], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@ +robustness_3_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.7,0.8], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@ +robustness_3_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.9,0.95], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@ + +START=0 +END_1=$(shell cat robustness_3_dv_jobs_p1 | wc -l) +END_2=$(shell cat robustness_3_dv_jobs_p2 | wc -l) +END_3=$(shell cat robustness_3_dv_jobs_p3 | wc -l) -START=1 -END=$(shell cat robustness_2_jobs | wc -l) STEP=1000 -ITEMS=$(shell seq $(START) $(STEP) $(END)) +ONE=1 +ITEMS_1=$(shell seq $(START) $(STEP) $(END_1)) +ITEMS_2=$(shell seq $(START) $(STEP) $(END_2)) +ITEMS_3=$(shell seq $(START) $(STEP) $(END_3)) + +robustness_3_dv.feather: robustness_3_dv_jobs_p1 robustness_3_dv_jobs_p2 robustness_3_dv_jobs_p3 + $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p1) + $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p2;) + $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p3;) + -robustness_2.feather: robustness_2_jobs - $(foreach item,$(ITEMS),sbatch --wait --verbose --array=$(shell expr $(item))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 $<) + +robustness_4_jobs_p1: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],y_bias=[-1,-0.85]}' --outfile $@ + +robustness_4_jobs_p2: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85], y_bias=[-0.70,-0.55]}' --outfile $@ + +robustness_4_jobs_p3: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],y_bias=[-0.4,-0.25]}' --outfile $@ + +robustness_4_jobs_p4: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],y_bias=[-0.1,0]}' --outfile $@ + +START=0 +END_1=$(shell cat robustness_4_jobs_p1 | wc -l) +END_2=$(shell cat robustness_4_jobs_p2 | wc -l) +END_3=$(shell cat robustness_4_jobs_p3 | wc -l) +END_4=$(shell cat robustness_4_jobs_p3 | wc -l) + +STEP=1000 +ONE=1 +ITEMS_1=$(shell seq $(START) $(STEP) $(END_1)) +ITEMS_2=$(shell seq $(START) $(STEP) $(END_2)) +ITEMS_3=$(shell seq $(START) $(STEP) $(END_3)) +ITEMS_4=$(shell seq $(START) $(STEP) $(END_4)) + +robustness_4.feather: robustness_4_jobs_p1 robustness_4_jobs_p2 robustness_4_jobs_p3 + $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p1) + $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p2;) + $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p3;) + + +robustness_4_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5] "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0,0.1]}' --outfile $@ + +robustness_4_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5] "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0.25,0.4]}' --outfile $@ + +robustness_4_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0.55,0.7]}' --outfile $@ +robustness_4_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py + rm -f $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],z_bias=[0.85,1]}' --outfile $@ + + +START=0 +END_1=$(shell cat robustness_4_dv_jobs_p1 | wc -l) +END_2=$(shell cat robustness_4_dv_jobs_p2 | wc -l) +END_3=$(shell cat robustness_4_dv_jobs_p3 | wc -l) + +STEP=1000 +ONE=1 +ITEMS_1=$(shell seq $(START) $(STEP) $(END_1)) +ITEMS_2=$(shell seq $(START) $(STEP) $(END_2)) +ITEMS_3=$(shell seq $(START) $(STEP) $(END_3)) + +robustness_4_dv.feather: robustness_4_dv_jobs_p1 robustness_4_dv_jobs_p2 robustness_4_dv_jobs_p3 + $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p1) + $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p2;) + $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p3;) # clean: diff --git a/simulations/robustness_check_notes.md b/simulations/robustness_check_notes.md index 0a287a7..1c786e9 100644 --- a/simulations/robustness_check_notes.md +++ b/simulations/robustness_check_notes.md @@ -12,3 +12,29 @@ Like `robustness\_1.RDS` but with a less precise model for $w_pred$. In the mai This is just example 1 with varying levels of classifier accuracy. +# robustness_2_dv.RDS + +Example 3 with varying levels of classifier accuracy + +# robustness_3.RDS + +Example 1 with varying levels of skewness in the classified variable. The variable `Px` is the baserate of $X$ and controls the skewness of $X$. +It probably makes more sense to report the mean of $X$ instead of `Px` in the supplement. + +# robustness_3_dv.RDS + +Example 3 with varying levels of skewness in the classified variable. The variable `B0` is the intercept of the main model and controls the skewness of $Y$. +It probably makes more sense to report the mean of $Y$ instead of B0 in the supplement. + +# robustness_4.RDS + +Example 2 with varying amounts of differential error. The variable `y_bias` controls the amount of differential error. +It probably makes more sense to report the corrleation between $Y$ and $X-~$, or the difference in accuracy from when when $Y=1$ to $Y=0$ in the supplement instead of `y_bias`. + +# robustness_4_dv.RDS + +Example 4 with varying amounts of bias. The variable `z_bias` controls the amount of differential error. +It probably makes more sense to report the corrleation between $Z$ and $Y-W$, or the difference in accuracy from when when $Z=1$ to $Z=0$ in the supplement instead of `z_bias`. + + + diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R index 82b17a7..08b11ec 100644 --- a/simulations/simulation_base.R +++ b/simulations/simulation_base.R @@ -151,10 +151,10 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu temp.df <- copy(df) temp.df[,y:=y.obs] mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula) - fisher.info <- solve(mod.caroll.lik$hessian) + fischer.info <- solve(mod.caroll.lik$hessian) coef <- mod.caroll.lik$par - ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96 - ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96 + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 result <- append(result, list(Bxy.est.mle = coef['x'], Bxy.ci.upper.mle = ci.upper['x'], @@ -299,11 +299,32 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL temp.df <- copy(df) temp.df <- temp.df[,x:=x.obs] mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula) - fisher.info <- solve(mod.caroll.lik$hessian) - coef <- mod.caroll.lik$par - ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96 - ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96 - + + ## tryCatch({ + ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') + ## (mod.calibrated.mle) + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) + ## result <- append(result, list( + ## Bxy.est.mecor = mecor.ci['Estimate'], + ## Bxy.ci.upper.mecor = mecor.ci['UCI'], + ## Bxy.ci.lower.mecor = mecor.ci['LCI']) + ## ) + + + + fischer.info <- NA + ci.upper <- NA + ci.lower <- NA + + tryCatch({fischer.info <- solve(mod.caroll.lik$hessian) + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 + }, + + error=function(e) {result[['error']] <- as.character(e) + }) + + coef <- mod.caroll.lik$par result <- append(result, list(Bxy.est.mle = coef['x'],