From: Nathan TeBlunthuis Date: Sat, 11 Feb 2023 20:26:48 +0000 (-0800) Subject: Make summarize estimator group correctly for robustness checks. X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/commitdiff_plain/b8d2048cc5338fbd872b55029c3e5d01c739a397?hp=-c Make summarize estimator group correctly for robustness checks. Also fix a possible bug in the MI logic and simplify the error correction formula in example 2. --- b8d2048cc5338fbd872b55029c3e5d01c739a397 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4efbae7 --- /dev/null +++ b/Makefile @@ -0,0 +1,3 @@ +all: + +$(MAKE) -C simulations + +$(MAKE) -C civil_comments diff --git a/simulations/Makefile b/simulations/Makefile index 1cab473..3e8fdd5 100644 --- a/simulations/Makefile +++ b/simulations/Makefile @@ -36,7 +36,7 @@ example_1.feather: example_1_jobs sbatch --wait --verbose --array=4001-$(shell cat example_1_jobs | wc -l) run_simulation.sbatch 0 example_1_jobs example_2_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py pl_methods.R - sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y*z*x"]}' --outfile example_2_jobs + sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+z+x"]}' --outfile example_2_jobs example_2.feather: example_2_jobs rm -f example_2.feather @@ -66,10 +66,10 @@ example_3.feather: example_3_jobs sbatch --wait --verbose --array=4001-$(shell cat example_3_jobs | wc -l) run_simulation.sbatch 0 example_3_jobs example_4_jobs: 04_depvar_differential.R simulation_base.R grid_sweep.py pl_methods.R - sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --outfile example_4_jobs + sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.3], "prediction_accuracy":[0.73]}' --outfile example_4_jobs example_4.feather: example_4_jobs - rm -f example_4.feather + rm -f example_4.feather sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_4_jobs sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_4_jobs sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_4_jobs @@ -86,41 +86,6 @@ remembr.RDS:example_1.feather example_2.feather example_3.feather example_4.feat ${srun} Rscript plot_dv_example.R --infile example_4.feather --name "plot.df.example.4" -irr_Ns = [1000] -irr_ms = [150,300,600] -irr_seeds=${seeds} -irr_explained_variances=${explained_variances} -irr_coder_accuracy=[0.80] - -example_5_jobs: 05_irr_indep.R irr_simulation_base.R grid_sweep.py pl_methods.R measerr_methods.R - sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 05_irr_indep.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_5.feather"], "y_explained_variance":${irr_explained_variances}, "coder_accuracy":${irr_coder_accuracy}}' --outfile example_5_jobs - -example_5.feather:example_5_jobs - rm -f example_5.feather - sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_5_jobs - sbatch --wait --verbose --array=1001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 1000 example_5_jobs - # sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 2000 example_5_jobs - # sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 3000 example_5_jobs - # sbatch --wait --verbose --array=2001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 4000 example_5_jobs - - - -# example_6_jobs: 06_irr_dv.R irr_dv_simulation_base.R grid_sweep.py pl_methods.R -# sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 06_irr_dv.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_6.feather"], "y_explained_variance":${irr_explained_variances},"coder_accuracy":${irr_coder_accuracy}}' --outfile example_6_jobs - -# example_6.feather:example_6_jobs -# rm -f example_6.feather -# sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_6_jobs -# sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 1000 example_6_jobs -# sbatch --wait --verbose --array=2001-$(shell cat example_6_jobs | wc -l) run_simulation.sbatch 2000 example_6_jobs - - -remember_irr.RDS: example_5.feather plot_irr_example.R plot_irr_dv_example.R summarize_estimator.R - rm -f remember_irr.RDS - sbatch --wait --verbose run_job.sbatch Rscript plot_irr_example.R --infile example_5.feather --name "plot.df.example.5" -# sbatch --wait --verbose run_job.sbatch Rscript plot_irr_dv_example.R --infile example_6.feather --name "plot.df.example.6" - - robustness_1_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs @@ -210,7 +175,7 @@ robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep. robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py rm -f $@ - ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@ + ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@ robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4 $(eval END_1!=cat robustness_2_dv_jobs_p1 | wc -l) @@ -361,8 +326,22 @@ robustness_4_dv.RDS: plot_dv_example.R robustness_4_dv.feather rm -f $@ ${srun} Rscript $< --infile $(word 2, $^) --name "robustness_4" --remember-file $@ + + +clean_main: + rm -f remembr.RDS + rm -f example_1_jobs + rm -f example_2_jobs + rm -f example_3_jobs + rm -f example_4_jobs + rm -f example_1.feather + rm -f example_2.feather + rm -f example_3.feather + rm -f example_4.feather + + # -clean: +clean_all: rm *.feather rm -f remembr.RDS rm -f remembr*.RDS diff --git a/simulations/robustness_check_notes.md b/simulations/robustness_check_notes.md index 64a472d..ac7e88f 100644 --- a/simulations/robustness_check_notes.md +++ b/simulations/robustness_check_notes.md @@ -2,11 +2,10 @@ Tests how robust the MLE method for independent variables with differential error is when the model for $X$ is less precise. In the main paper, we include $Z$ on the right-hand-side of the `truth_formula`. In this robustness check, the `truth_formula` is an intercept-only model. -The stats are in the list named `robustness_1` in the `.RDS` file. - +The stats are in the list named `robustness_1` in the `.RDS` # robustness\_1\_dv.RDS -Like `robustness\_1.RDS` but with a less precise model for $w_pred$. In the main paper, we included $Z$ in the `outcome_formula`. In this robustness check, we do not. +Like `robustness\_1.RDS` but with a less precise model for $w_pred$. In the main paper, we included $Z$ in the `proxy_formula`. In this robustness check, we do not. # robustness_2.RDS diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R index e715edf..73544e9 100644 --- a/simulations/simulation_base.R +++ b/simulations/simulation_base.R @@ -280,79 +280,83 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL Bxy.ci.lower.naive = naive.ci.Bxy[1], Bzy.ci.upper.naive = naive.ci.Bzy[2], Bzy.ci.lower.naive = naive.ci.Bzy[1])) - + amelia_result <- list( + Bxy.est.amelia.full = NULL, + Bxy.ci.upper.amelia.full = NULL, + Bxy.ci.lower.amelia.full = NULL, + Bzy.est.amelia.full = NULL, + Bzy.ci.upper.amelia.full = NULL, + Bzy.ci.lower.amelia.full = NULL + ) - amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w')) - mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) - (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + tryCatch({ + amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w')) + mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) + (coefse <- combine_coef_se(mod.amelia.k)) - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) + est.x.mi <- coefse['x.obs','Estimate'] + est.x.se <- coefse['x.obs','Std.Error'] + est.z.mi <- coefse['z','Estimate'] + est.z.se <- coefse['z','Std.Error'] - est.z.mi <- coefse['z','Estimate'] - est.z.se <- coefse['z','Std.Error'] + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) - result <- append(result, - list(Bzy.est.amelia.full = est.z.mi, - Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, - Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se - )) + }, + error = function(e){ + result[['error']] <- e} + ) - temp.df <- copy(df) - temp.df <- temp.df[,x:=x.obs] - mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula) - - ## tryCatch({ - ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') - ## (mod.calibrated.mle) - ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) - ## result <- append(result, list( - ## Bxy.est.mecor = mecor.ci['Estimate'], - ## Bxy.ci.upper.mecor = mecor.ci['UCI'], - ## Bxy.ci.lower.mecor = mecor.ci['LCI']) - ## ) + result <- append(result, amelia_result) - fischer.info <- NA - ci.upper <- NA - ci.lower <- NA + mle_result <- list(Bxy.est.mle = NULL, + Bxy.ci.upper.mle = NULL, + Bxy.ci.lower.mle = NULL, + Bzy.est.mle = NULL, + Bzy.ci.upper.mle = NULL, + Bzy.ci.lower.mle = NULL) - tryCatch({fischer.info <- solve(mod.caroll.lik$hessian) + tryCatch({ + temp.df <- copy(df) + temp.df <- temp.df[,x:=x.obs] + mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula) + fischer.info <- solve(mod.caroll.lik$hessian) + coef <- mod.caroll.lik$par ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 + mle_result <- list(Bxy.est.mle = coef['x'], + Bxy.ci.upper.mle = ci.upper['x'], + Bxy.ci.lower.mle = ci.lower['x'], + Bzy.est.mle = coef['z'], + Bzy.ci.upper.mle = ci.upper['z'], + Bzy.ci.lower.mle = ci.lower['z']) }, error=function(e) {result[['error']] <- as.character(e) }) - coef <- mod.caroll.lik$par - result <- append(result, - list(Bxy.est.mle = coef['x'], - Bxy.ci.upper.mle = ci.upper['x'], - Bxy.ci.lower.mle = ci.lower['x'], - Bzy.est.mle = coef['z'], - Bzy.ci.upper.mle = ci.upper['z'], - Bzy.ci.lower.mle = ci.lower['z'])) - - mod.zhang.lik <- zhang.mle.iv(df) - coef <- coef(mod.zhang.lik) - ci <- confint(mod.zhang.lik,method='quad') - result <- append(result, - list(Bxy.est.zhang = coef['Bxy'], - Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], - Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], - Bzy.est.zhang = coef['Bzy'], - Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], - Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) + result <- append(result, mle_result) + + mod.zhang.lik <- zhang.mle.iv(df) + coef <- coef(mod.zhang.lik) + ci <- confint(mod.zhang.lik,method='quad') + result <- append(result, + list(Bxy.est.zhang = coef['Bxy'], + Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], + Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], + Bzy.est.zhang = coef['Bzy'], + Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], + Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms) diff --git a/simulations/summarize_estimator.R b/simulations/summarize_estimator.R index 3e4209f..f416c5b 100644 --- a/simulations/summarize_estimator.R +++ b/simulations/summarize_estimator.R @@ -1,17 +1,21 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){ - part <- df[,c('N', - 'm', - 'Bxy', - paste0('B',coefname,'y.est.',suffix), - paste0('B',coefname,'y.ci.lower.',suffix), - paste0('B',coefname,'y.ci.upper.',suffix), - 'y_explained_variance', - 'Bzx', - 'Bzy', - 'accuracy_imbalance_difference' - ), + reported_vars <- c( + 'Bxy', + paste0('B',coefname,'y.est.',suffix), + paste0('B',coefname,'y.ci.lower.',suffix), + paste0('B',coefname,'y.ci.upper.',suffix) + ) + + + grouping_vars <- c('N','m','B0', 'Bxy', 'Bzy', 'Bzx', 'Px', 'y_explained_variance', 'prediction_accuracy','outcome_formula','proxy_formula','truth_formula','z_bias','y_bias') + + grouping_vars <- grouping_vars[grouping_vars %in% names(df)] + + part <- df[, + c(reported_vars, + grouping_vars), with=FALSE] @@ -27,8 +31,8 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){ part.plot <- part[, .(p.true.in.ci = mean(true.in.ci), mean.bias = mean(bias), - mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]]), - var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]), + mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T), + var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T), est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.975,na.rm=T), est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.025,na.rm=T), mean.ci.upper = mean(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T), @@ -43,7 +47,7 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){ variable=coefname, method=suffix ), - by=c("N","m",'y_explained_variance','Bzx', 'Bzy', 'accuracy_imbalance_difference') + by=grouping_vars, ] return(part.plot)