--- /dev/null
+all:
+ +$(MAKE) -C simulations
+ +$(MAKE) -C civil_comments
sbatch --wait --verbose --array=4001-$(shell cat example_1_jobs | wc -l) run_simulation.sbatch 0 example_1_jobs
example_2_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py pl_methods.R
- sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y*z*x"]}' --outfile example_2_jobs
+ sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+z+x"]}' --outfile example_2_jobs
example_2.feather: example_2_jobs
rm -f example_2.feather
sbatch --wait --verbose --array=4001-$(shell cat example_3_jobs | wc -l) run_simulation.sbatch 0 example_3_jobs
example_4_jobs: 04_depvar_differential.R simulation_base.R grid_sweep.py pl_methods.R
- sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --outfile example_4_jobs
+ sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.3], "prediction_accuracy":[0.73]}' --outfile example_4_jobs
example_4.feather: example_4_jobs
- rm -f example_4.feather
+ rm -f example_4.feather
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_4_jobs
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_4_jobs
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_4_jobs
${srun} Rscript plot_dv_example.R --infile example_4.feather --name "plot.df.example.4"
-irr_Ns = [1000]
-irr_ms = [150,300,600]
-irr_seeds=${seeds}
-irr_explained_variances=${explained_variances}
-irr_coder_accuracy=[0.80]
-
-example_5_jobs: 05_irr_indep.R irr_simulation_base.R grid_sweep.py pl_methods.R measerr_methods.R
- sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 05_irr_indep.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_5.feather"], "y_explained_variance":${irr_explained_variances}, "coder_accuracy":${irr_coder_accuracy}}' --outfile example_5_jobs
-
-example_5.feather:example_5_jobs
- rm -f example_5.feather
- sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_5_jobs
- sbatch --wait --verbose --array=1001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 1000 example_5_jobs
- # sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 2000 example_5_jobs
- # sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 3000 example_5_jobs
- # sbatch --wait --verbose --array=2001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 4000 example_5_jobs
-
-
-
-# example_6_jobs: 06_irr_dv.R irr_dv_simulation_base.R grid_sweep.py pl_methods.R
-# sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 06_irr_dv.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_6.feather"], "y_explained_variance":${irr_explained_variances},"coder_accuracy":${irr_coder_accuracy}}' --outfile example_6_jobs
-
-# example_6.feather:example_6_jobs
-# rm -f example_6.feather
-# sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_6_jobs
-# sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 1000 example_6_jobs
-# sbatch --wait --verbose --array=2001-$(shell cat example_6_jobs | wc -l) run_simulation.sbatch 2000 example_6_jobs
-
-
-remember_irr.RDS: example_5.feather plot_irr_example.R plot_irr_dv_example.R summarize_estimator.R
- rm -f remember_irr.RDS
- sbatch --wait --verbose run_job.sbatch Rscript plot_irr_example.R --infile example_5.feather --name "plot.df.example.5"
-# sbatch --wait --verbose run_job.sbatch Rscript plot_irr_dv_example.R --infile example_6.feather --name "plot.df.example.6"
-
-
robustness_1_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs
robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R
rm -f $@
- ${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
+ ${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4
$(eval END_1!=cat robustness_2_dv_jobs_p1 | wc -l)
rm -f $@
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_4" --remember-file $@
+
+
+clean_main:
+ rm -f remembr.RDS
+ rm -f example_1_jobs
+ rm -f example_2_jobs
+ rm -f example_3_jobs
+ rm -f example_4_jobs
+ rm -f example_1.feather
+ rm -f example_2.feather
+ rm -f example_3.feather
+ rm -f example_4.feather
+
+
#
-clean:
+clean_all:
	rm -f *.feather
rm -f remembr.RDS
rm -f remembr*.RDS
Tests how robust the MLE method for independent variables with differential error is when the model for $X$ is less precise. In the main paper, we include $Z$ on the right-hand side of the `truth_formula`.
In this robustness check, the `truth_formula` is an intercept-only model.
-The stats are in the list named `robustness_1` in the `.RDS` file.
-
+The stats are in the list named `robustness_1` in the `.RDS` file.
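
A minimal sketch of the specification difference in R (the intercept-only string `x~1` is copied from the `robustness_1` grid-sweep call; the main-paper formula and the `.RDS` file name are assumptions based on the surrounding description):

```r
## Main-paper truth model: Z enters on the right-hand side (assumed form).
truth_formula_main  <- x ~ z
## Robustness check: intercept-only truth model, passed as "truth_formula":["x~1"].
truth_formula_check <- x ~ 1

## Pull the summary statistics from the remember file written by the plotting script
## (file name assumed from the section headings).
robustness_1_stats <- readRDS("robustness_1.RDS")[["robustness_1"]]
```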
# robustness_1_dv.RDS
-Like `robustness\_1.RDS` but with a less precise model for $w_pred$. In the main paper, we included $Z$ in the `outcome_formula`. In this robustness check, we do not.
+Like `robustness_1.RDS` but with a less precise model for $w_{pred}$. In the main paper, we included $Z$ in the `proxy_formula`. In this robustness check, we do not.
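
A similarly hedged sketch of the proxy-model change (the exact formula strings used by the dependent-variable scripts are not shown in this excerpt, so these are illustrative):

```r
## Main-paper proxy model: error in w_pred may depend on z as well as on the outcome.
proxy_formula_main  <- w_pred ~ y + z
## Robustness check: drop z, i.e. a less precise model for w_pred.
proxy_formula_check <- w_pred ~ y
```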
# robustness_2.RDS
Bxy.ci.lower.naive = naive.ci.Bxy[1],
Bzy.ci.upper.naive = naive.ci.Bzy[2],
Bzy.ci.lower.naive = naive.ci.Bzy[1]))
-
+ ## placeholder estimates (NULL) so the amelia.full entries exist even if imputation fails
+ amelia_result <- list(
+ Bxy.est.amelia.full = NULL,
+ Bxy.ci.upper.amelia.full = NULL,
+ Bxy.ci.lower.amelia.full = NULL,
+ Bzy.est.amelia.full = NULL,
+ Bzy.ci.upper.amelia.full = NULL,
+ Bzy.ci.lower.amelia.full = NULL
+ )
- amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
- mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
- (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+ tryCatch({
+ amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
+ mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+ (coefse <- combine_coef_se(mod.amelia.k))
- est.x.mi <- coefse['x.obs','Estimate']
- est.x.se <- coefse['x.obs','Std.Error']
- result <- append(result,
- list(Bxy.est.amelia.full = est.x.mi,
- Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
- Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
- ))
+ est.x.mi <- coefse['x.obs','Estimate']
+ est.x.se <- coefse['x.obs','Std.Error']
+ est.z.mi <- coefse['z','Estimate']
+ est.z.se <- coefse['z','Std.Error']
- est.z.mi <- coefse['z','Estimate']
- est.z.se <- coefse['z','Std.Error']
+ amelia_result <- list(Bxy.est.amelia.full = est.x.mi,
+ Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+ Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se,
+ Bzy.est.amelia.full = est.z.mi,
+ Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
+ Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
+ )
- result <- append(result,
- list(Bzy.est.amelia.full = est.z.mi,
- Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
- Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
- ))
+ },
+ ## use <<- so the error is recorded in the enclosing `result`
+ ## (a plain <- only assigns inside the handler's own environment)
+ error = function(e){
+   result[['error']] <<- as.character(e)}
+ )
- temp.df <- copy(df)
- temp.df <- temp.df[,x:=x.obs]
- mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
-
- ## tryCatch({
- ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient')
- ## (mod.calibrated.mle)
- ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
- ## result <- append(result, list(
- ## Bxy.est.mecor = mecor.ci['Estimate'],
- ## Bxy.ci.upper.mecor = mecor.ci['UCI'],
- ## Bxy.ci.lower.mecor = mecor.ci['LCI'])
- ## )
+ result <- append(result, amelia_result)
- fischer.info <- NA
- ci.upper <- NA
- ci.lower <- NA
+ ## placeholder estimates (NULL) used if the measurement-error MLE fails below
+ mle_result <- list(Bxy.est.mle = NULL,
+ Bxy.ci.upper.mle = NULL,
+ Bxy.ci.lower.mle = NULL,
+ Bzy.est.mle = NULL,
+ Bzy.ci.upper.mle = NULL,
+ Bzy.ci.lower.mle = NULL)
- tryCatch({fischer.info <- solve(mod.caroll.lik$hessian)
+ tryCatch({
+ temp.df <- copy(df)
+ temp.df <- temp.df[,x:=x.obs]
+ mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
+ ## invert the Hessian (observed Fisher information) to approximate the covariance of the MLE
+ fischer.info <- solve(mod.caroll.lik$hessian)
+ coef <- mod.caroll.lik$par
ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
+ mle_result <- list(Bxy.est.mle = coef['x'],
+ Bxy.ci.upper.mle = ci.upper['x'],
+ Bxy.ci.lower.mle = ci.lower['x'],
+ Bzy.est.mle = coef['z'],
+ Bzy.ci.upper.mle = ci.upper['z'],
+ Bzy.ci.lower.mle = ci.lower['z'])
  },
  ## <<- required so the error is recorded in the enclosing `result`
  error=function(e) {result[['error']] <<- as.character(e)
  })
- coef <- mod.caroll.lik$par
- result <- append(result,
- list(Bxy.est.mle = coef['x'],
- Bxy.ci.upper.mle = ci.upper['x'],
- Bxy.ci.lower.mle = ci.lower['x'],
- Bzy.est.mle = coef['z'],
- Bzy.ci.upper.mle = ci.upper['z'],
- Bzy.ci.lower.mle = ci.lower['z']))
-
- mod.zhang.lik <- zhang.mle.iv(df)
- coef <- coef(mod.zhang.lik)
- ci <- confint(mod.zhang.lik,method='quad')
- result <- append(result,
- list(Bxy.est.zhang = coef['Bxy'],
- Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
- Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
- Bzy.est.zhang = coef['Bzy'],
- Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
- Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
+ result <- append(result, mle_result)
+
+ mod.zhang.lik <- zhang.mle.iv(df)
+ coef <- coef(mod.zhang.lik)
+ ci <- confint(mod.zhang.lik,method='quad')
+ result <- append(result,
+ list(Bxy.est.zhang = coef['Bxy'],
+ Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
+ Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
+ Bzy.est.zhang = coef['Bzy'],
+ Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
+ Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
summarize.estimator <- function(df, suffix='naive', coefname='x'){
- part <- df[,c('N',
- 'm',
- 'Bxy',
- paste0('B',coefname,'y.est.',suffix),
- paste0('B',coefname,'y.ci.lower.',suffix),
- paste0('B',coefname,'y.ci.upper.',suffix),
- 'y_explained_variance',
- 'Bzx',
- 'Bzy',
- 'accuracy_imbalance_difference'
- ),
+ reported_vars <- c(
+ 'Bxy',
+ paste0('B',coefname,'y.est.',suffix),
+ paste0('B',coefname,'y.ci.lower.',suffix),
+ paste0('B',coefname,'y.ci.upper.',suffix)
+ )
+
+
+ grouping_vars <- c('N','m','B0', 'Bxy', 'Bzy', 'Bzx', 'Px', 'y_explained_variance', 'prediction_accuracy','outcome_formula','proxy_formula','truth_formula','z_bias','y_bias')
+
+ grouping_vars <- grouping_vars[grouping_vars %in% names(df)]
+
+ part <- df[,
+ c(reported_vars,
+ grouping_vars),
with=FALSE]
part.plot <- part[, .(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
- mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
- var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
+ mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
+ var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.975,na.rm=T),
est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.025,na.rm=T),
mean.ci.upper = mean(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T),
variable=coefname,
method=suffix
),
- by=c("N","m",'y_explained_variance','Bzx', 'Bzy', 'accuracy_imbalance_difference')
                      by=grouping_vars
]
return(part.plot)