code.communitydata.science - ml_measurement_error_public.git/commitdiff
Make summarize.estimator group results correctly for robustness checks.
author Nathan TeBlunthuis <nathante@uw.edu>
Sat, 11 Feb 2023 20:26:48 +0000 (12:26 -0800)
committer Nathan TeBlunthuis <nathante@uw.edu>
Sat, 11 Feb 2023 20:26:48 +0000 (12:26 -0800)
Also fix a possible bug in the MI logic and simplify the error
correction formula in example 2.
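
The grouping fix in summarize_estimator.R (full diff below) aggregates over whichever design parameters are actually present in a given result set instead of a fixed list. A minimal sketch of the idea, using a toy data.table rather than real simulation output:

    library(data.table)
    # Toy result set (not repository data): a run that records z_bias but has
    # no prediction_accuracy or Bzx column.
    df <- data.table(N = c(1000, 1000), m = c(300, 600),
                     Bxy = 0.3, z_bias = 0.3,
                     Bxy.est.naive = c(0.25, 0.27))
    grouping_vars <- c('N', 'm', 'Bxy', 'Bzx', 'z_bias', 'prediction_accuracy')
    # Keep only the grouping columns that exist in this particular result set.
    grouping_vars <- grouping_vars[grouping_vars %in% names(df)]
    df[, .(mean.est = mean(Bxy.est.naive, na.rm = TRUE)), by = grouping_vars]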

Makefile [new file with mode: 0644]
simulations/Makefile
simulations/robustness_check_notes.md
simulations/simulation_base.R
simulations/summarize_estimator.R

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4efbae7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,3 @@
+all:
+       +$(MAKE) -C simulations
+       +$(MAKE) -C civil_comments
diff --git a/simulations/Makefile b/simulations/Makefile
index 1cab47320e5238ff9afa7b6938003a56136aa34f..3e8fdd5c24a391171bf1f9e47de85140b9c96515 100644
@@ -36,7 +36,7 @@ example_1.feather: example_1_jobs
        sbatch --wait --verbose --array=4001-$(shell cat example_1_jobs | wc -l)  run_simulation.sbatch 0 example_1_jobs
 
 example_2_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py pl_methods.R
-       sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y*z*x"]}' --outfile example_2_jobs
+       sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+z+x"]}' --outfile example_2_jobs
 
 example_2.feather: example_2_jobs 
        rm -f example_2.feather
@@ -66,10 +66,10 @@ example_3.feather: example_3_jobs
        sbatch --wait --verbose --array=4001-$(shell cat example_3_jobs | wc -l)  run_simulation.sbatch 0 example_3_jobs
 
 example_4_jobs: 04_depvar_differential.R simulation_base.R grid_sweep.py pl_methods.R
-       sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --outfile example_4_jobs
+       sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.3], "prediction_accuracy":[0.73]}' --outfile example_4_jobs
 
 example_4.feather: example_4_jobs
-       rm -f example_4.feather 
+       rm -f example_4.feather
        sbatch --wait --verbose --array=1-1000  run_simulation.sbatch 0 example_4_jobs
        sbatch --wait --verbose --array=1001-2000  run_simulation.sbatch 0 example_4_jobs
        sbatch --wait --verbose --array=2001-3000  run_simulation.sbatch 0 example_4_jobs
@@ -86,41 +86,6 @@ remembr.RDS:example_1.feather example_2.feather example_3.feather example_4.feat
        ${srun} Rscript plot_dv_example.R --infile example_4.feather --name "plot.df.example.4"
 
 
-irr_Ns = [1000]
-irr_ms = [150,300,600]
-irr_seeds=${seeds}
-irr_explained_variances=${explained_variances}
-irr_coder_accuracy=[0.80]
-
-example_5_jobs: 05_irr_indep.R irr_simulation_base.R grid_sweep.py pl_methods.R measerr_methods.R
-       sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 05_irr_indep.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_5.feather"], "y_explained_variance":${irr_explained_variances}, "coder_accuracy":${irr_coder_accuracy}}' --outfile example_5_jobs
-
-example_5.feather:example_5_jobs
-       rm -f example_5.feather
-       sbatch --wait --verbose --array=1-1000  run_simulation.sbatch 0 example_5_jobs
-       sbatch --wait --verbose --array=1001-$(shell cat example_5_jobs | wc -l)  run_simulation.sbatch 1000 example_5_jobs
-       # sbatch --wait --verbose --array=2001-3000  run_simulation.sbatch 2000 example_5_jobs
-       # sbatch --wait --verbose --array=3001-4000  run_simulation.sbatch 3000 example_5_jobs
-       # sbatch --wait --verbose --array=2001-$(shell cat example_5_jobs | wc -l)  run_simulation.sbatch 4000 example_5_jobs
-
-
-
-# example_6_jobs: 06_irr_dv.R irr_dv_simulation_base.R grid_sweep.py pl_methods.R
-#      sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 06_irr_dv.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_6.feather"], "y_explained_variance":${irr_explained_variances},"coder_accuracy":${irr_coder_accuracy}}' --outfile example_6_jobs
-
-# example_6.feather:example_6_jobs
-#      rm -f example_6.feather
-#      sbatch --wait --verbose --array=1-1000  run_simulation.sbatch 0 example_6_jobs
-#      sbatch --wait --verbose --array=1001-2000  run_simulation.sbatch 1000 example_6_jobs
-#      sbatch --wait --verbose --array=2001-$(shell cat example_6_jobs | wc -l)  run_simulation.sbatch 2000 example_6_jobs
-
-
-remember_irr.RDS: example_5.feather plot_irr_example.R plot_irr_dv_example.R summarize_estimator.R
-       rm -f remember_irr.RDS
-       sbatch --wait --verbose run_job.sbatch Rscript plot_irr_example.R --infile example_5.feather --name "plot.df.example.5"
-#      sbatch --wait --verbose run_job.sbatch Rscript plot_irr_dv_example.R --infile example_6.feather --name "plot.df.example.6"
-
-
 robustness_1_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py
        sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs
 
@@ -210,7 +175,7 @@ robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.
 
 robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
        rm -f $@
-       ${srun} $< --command 'Rscript 01_two_covariates.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
+       ${srun} $< --command 'Rscript 03_depvar.R'  --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances},  "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
 
 robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4
        $(eval END_1!=cat robustness_2_dv_jobs_p1 | wc -l)
@@ -361,8 +326,22 @@ robustness_4_dv.RDS: plot_dv_example.R robustness_4_dv.feather
        rm -f $@
        ${srun} Rscript $< --infile $(word 2, $^) --name "robustness_4" --remember-file $@
 
+
+
+clean_main:
+       rm -f remembr.RDS
+       rm -f example_1_jobs
+       rm -f example_2_jobs
+       rm -f example_3_jobs
+       rm -f example_4_jobs
+       rm -f example_1.feather
+       rm -f example_2.feather
+       rm -f example_3.feather
+       rm -f example_4.feather
+
+
 #      
-clean:
+clean_all:
        rm *.feather
        rm -f remembr.RDS
        rm -f remembr*.RDS
diff --git a/simulations/robustness_check_notes.md b/simulations/robustness_check_notes.md
index 64a472d17bd54b14e426d943952f2122ba4cb4ea..ac7e88fac4ab937b5e274de557606eb702702310 100644
@@ -2,11 +2,10 @@
 
 Tests how robust the MLE method for independent variables with differential error is when the model for $X$ is less precise. In the main paper, we include $Z$ on the right-hand-side of the `truth_formula`. 
 In this robustness check, the `truth_formula` is an intercept-only model.
-The stats are in the list named `robustness_1` in the `.RDS` file.
-
+The stats are in the list named `robustness_1` in the `.RDS` file.
 # robustness\_1\_dv.RDS
 
-Like `robustness\_1.RDS` but with a less precise model for $w_pred$.  In the main paper, we included $Z$ in the `outcome_formula`. In this robustness check, we do not.
+Like `robustness\_1.RDS` but with a less precise model for `w_pred`. In the main paper, we included $Z$ in the `proxy_formula`. In this robustness check, we do not.
 
 # robustness_2.RDS
 
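
The two checks described above amount to changing which R formulas the simulation scripts receive. A rough sketch of the contrast (illustrative formula objects only; the actual values are set through the grid_sweep.py calls in simulations/Makefile):

    # Main-paper specification, e.g. as in the example_2 rule above: the truth
    # model for x includes z, and the proxy model for w_pred includes z.
    truth_formula_main <- x ~ z
    proxy_formula_main <- w_pred ~ y + z + x

    # robustness_1: intercept-only truth model for x.
    truth_formula_robust <- x ~ 1

    # robustness_1_dv: proxy model without z.
    proxy_formula_robust <- w_pred ~ y + x
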
diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R
index e715edfaf61b9b1f423001808187ad66828a43b0..73544e9aee194800d6ae7a9dd0e4f279db978869 100644
@@ -280,79 +280,83 @@ run_simulation <-  function(df, result, outcome_formula=y~x+z, proxy_formula=NUL
                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
                                   Bzy.ci.upper.naive = naive.ci.Bzy[2],
                                   Bzy.ci.lower.naive = naive.ci.Bzy[1]))
-                                  
 
+    amelia_result <- list(
+        Bxy.est.amelia.full = NULL,
+        Bxy.ci.upper.amelia.full = NULL,
+        Bxy.ci.lower.amelia.full = NULL,
+        Bzy.est.amelia.full = NULL,
+        Bzy.ci.upper.amelia.full = NULL,
+        Bzy.ci.lower.amelia.full = NULL
+        )
 
-    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
-    mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
-    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+    tryCatch({
+        amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
+        mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+        (coefse <- combine_coef_se(mod.amelia.k))
 
-    est.x.mi <- coefse['x.obs','Estimate']
-    est.x.se <- coefse['x.obs','Std.Error']
-    result <- append(result,
-                     list(Bxy.est.amelia.full = est.x.mi,
-                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
-                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
-                          ))
+        est.x.mi <- coefse['x.obs','Estimate']
+        est.x.se <- coefse['x.obs','Std.Error']
+        est.z.mi <- coefse['z','Estimate']
+        est.z.se <- coefse['z','Std.Error']
 
-    est.z.mi <- coefse['z','Estimate']
-    est.z.se <- coefse['z','Std.Error']
+        amelia_result <- list(Bxy.est.amelia.full = est.x.mi,
+                              Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                              Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se,
+                              Bzy.est.amelia.full = est.z.mi,
+                              Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
+                              Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
+                              )
 
-    result <- append(result,
-                     list(Bzy.est.amelia.full = est.z.mi,
-                          Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
-                          Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
-                          ))
+    },
 
+    error = function(e){
+        result[['error']] <- e}
+    )
 
-        temp.df <- copy(df)
-        temp.df <- temp.df[,x:=x.obs]
-        mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
-
-    ## tryCatch({
-    ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient')
-    ## (mod.calibrated.mle)
-    ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
-    ## result <- append(result, list(
-    ##                              Bxy.est.mecor = mecor.ci['Estimate'],
-    ##                              Bxy.ci.upper.mecor = mecor.ci['UCI'],
-    ##                              Bxy.ci.lower.mecor = mecor.ci['LCI'])
-    ##                  )
 
+    result <- append(result, amelia_result)
 
 
-    fischer.info <- NA
-    ci.upper <- NA
-    ci.lower <- NA
+   mle_result <- list(Bxy.est.mle = NULL,
+                      Bxy.ci.upper.mle = NULL,
+                      Bxy.ci.lower.mle = NULL,
+                      Bzy.est.mle = NULL,
+                      Bzy.ci.upper.mle = NULL,
+                      Bzy.ci.lower.mle = NULL)
 
-    tryCatch({fischer.info <- solve(mod.caroll.lik$hessian)
+    tryCatch({
+        temp.df <- copy(df)
+        temp.df <- temp.df[,x:=x.obs]
+        mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
+        fischer.info <- solve(mod.caroll.lik$hessian)
+        coef <- mod.caroll.lik$par
         ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
         ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
+        mle_result <- list(Bxy.est.mle = coef['x'],
+                           Bxy.ci.upper.mle = ci.upper['x'],
+                           Bxy.ci.lower.mle = ci.lower['x'],
+                           Bzy.est.mle = coef['z'],
+                           Bzy.ci.upper.mle = ci.upper['z'],
+                           Bzy.ci.lower.mle = ci.lower['z'])
     },
 
     error=function(e) {result[['error']] <- as.character(e)
     })
 
-    coef <- mod.caroll.lik$par
         
-        result <- append(result,
-                         list(Bxy.est.mle = coef['x'],
-                              Bxy.ci.upper.mle = ci.upper['x'],
-                              Bxy.ci.lower.mle = ci.lower['x'],
-                              Bzy.est.mle = coef['z'],
-                              Bzy.ci.upper.mle = ci.upper['z'],
-                              Bzy.ci.lower.mle = ci.lower['z']))
-
-        mod.zhang.lik <- zhang.mle.iv(df)
-        coef <- coef(mod.zhang.lik)
-        ci <- confint(mod.zhang.lik,method='quad')
-        result <- append(result,
-                         list(Bxy.est.zhang = coef['Bxy'],
-                              Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
-                              Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
-                              Bzy.est.zhang = coef['Bzy'],
-                              Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
-                              Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
+    result <- append(result, mle_result)
+
+    mod.zhang.lik <- zhang.mle.iv(df)
+    coef <- coef(mod.zhang.lik)
+    ci <- confint(mod.zhang.lik,method='quad')
+    result <- append(result,
+                     list(Bxy.est.zhang = coef['Bxy'],
+                          Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
+                          Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
+                          Bzy.est.zhang = coef['Bzy'],
+                          Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
+                          Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
 
     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
     ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
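
The rewrite above pre-fills the Amelia and MLE results with NULL placeholders and only overwrites them inside tryCatch, so a failed imputation or a singular Hessian no longer aborts the whole simulation run. A minimal standalone sketch of that guard pattern (risky_fit and df are hypothetical stand-ins, not functions or data from this repository):

    result <- list()
    # Defaults that get reported if the fit fails; overwritten on success.
    fit_result <- list(est = NULL, ci.upper = NULL, ci.lower = NULL)
    tryCatch({
        fit <- risky_fit(df)   # stand-in for amelia()/zelig() or measerr_mle()
        fit_result <- list(est = fit$est,
                           ci.upper = fit$est + 1.96 * fit$se,
                           ci.lower = fit$est - 1.96 * fit$se)
    }, error = function(e) message("fit failed: ", conditionMessage(e)))
    result <- append(result, fit_result)

Because tryCatch evaluates its first argument in the calling frame, the assignment to fit_result on success is visible after the call, while a failure leaves the NULL defaults in place.
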
diff --git a/simulations/summarize_estimator.R b/simulations/summarize_estimator.R
index 3e4209f42f4c3486a89c1499e221c297b424b643..f416c5b30bbaf9e664499338fc002bdad42686e0 100644
@@ -1,17 +1,21 @@
 
 summarize.estimator <- function(df, suffix='naive', coefname='x'){
 
-    part <- df[,c('N',
-                  'm',
-                  'Bxy',
-                  paste0('B',coefname,'y.est.',suffix),
-                  paste0('B',coefname,'y.ci.lower.',suffix),
-                  paste0('B',coefname,'y.ci.upper.',suffix),
-                  'y_explained_variance',
-                  'Bzx',
-                  'Bzy',
-                  'accuracy_imbalance_difference'
-                  ),
+    reported_vars <- c(
+                       'Bxy',
+                       paste0('B',coefname,'y.est.',suffix),
+                       paste0('B',coefname,'y.ci.lower.',suffix),
+                       paste0('B',coefname,'y.ci.upper.',suffix)
+                       )
+
+    
+    grouping_vars <- c('N','m','B0', 'Bxy', 'Bzy', 'Bzx', 'Px', 'y_explained_variance', 'prediction_accuracy','outcome_formula','proxy_formula','truth_formula','z_bias','y_bias')
+
+    grouping_vars <- grouping_vars[grouping_vars %in% names(df)]
+
+    part <- df[,
+               c(reported_vars,
+                 grouping_vars),
                with=FALSE]
 
 
@@ -27,8 +31,8 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){
 
     part.plot <- part[, .(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
-                          mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
-                          var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
+                          mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
+                          var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
                           est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.975,na.rm=T),
                           est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.025,na.rm=T),
                           mean.ci.upper = mean(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T),
@@ -43,7 +47,7 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){
                           variable=coefname,
                           method=suffix
                           ),
-                      by=c("N","m",'y_explained_variance','Bzx', 'Bzy', 'accuracy_imbalance_difference')
+                      by=grouping_vars,
                       ]
     
     return(part.plot)
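
For context, summarize.estimator is presumably called once per estimator suffix and coefficient by the plotting scripts referenced in the Makefile (e.g. plot_dv_example.R). A hypothetical usage sketch, assuming the .feather result files can be read with arrow::read_feather:

    library(arrow)
    library(data.table)
    source('summarize_estimator.R')
    # Hypothetical example: summarize each estimator for the x coefficient.
    df <- data.table(read_feather('example_2.feather'))
    plot.df <- rbindlist(list(
        summarize.estimator(df, suffix = 'naive',       coefname = 'x'),
        summarize.estimator(df, suffix = 'amelia.full', coefname = 'x'),
        summarize.estimator(df, suffix = 'mle',         coefname = 'x'),
        summarize.estimator(df, suffix = 'zhang',       coefname = 'x')
    ))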
