From: Nathan TeBlunthuis
Date: Sun, 11 Dec 2022 20:54:34 +0000 (-0800)
Subject: cleaning up + implementing robustness checks
X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/commitdiff_plain/82fe7b0f482a71c95e8ae99f7e6d37b79357506a?ds=inline;hp=3d1964b806106d76f13301f0cf6dccf35cd7d66c

cleaning up + implementing robustness checks

+ add pl_methods.R
+ update makefile
+ fix bug in 02_indep_differential.R
+ start documenting robustness checks in robustness_check_notes.md
---

diff --git a/simulations/02_indep_differential.R b/simulations/02_indep_differential.R
index 6e2732f..5d34312 100644
--- a/simulations/02_indep_differential.R
+++ b/simulations/02_indep_differential.R
@@ -104,9 +104,10 @@ simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.
     ## print(mean(df[z==1]$x == df[z==1]$w_pred))
     ## print(mean(df$w_pred == df$x))
 
+    resids <- resid(lm(y~x + z))
-    odds.x1 <- qlogis(prediction_accuracy) + y_bias*qlogis(pnorm(resids[x==1])) + z_bias * qlogis(pnorm(z,sd(z)))
-    odds.x0 <- qlogis(prediction_accuracy,lower.tail=F) + y_bias*qlogis(pnorm(resids[x==0])) + z_bias * qlogis(pnorm(z,sd(z)))
+    odds.x1 <- qlogis(prediction_accuracy) + y_bias*qlogis(pnorm(resids[x==1])) + z_bias * qlogis(pnorm(z[x==1],sd(z)))
+    odds.x0 <- qlogis(prediction_accuracy,lower.tail=F) + y_bias*qlogis(pnorm(resids[x==0])) + z_bias * qlogis(pnorm(z[x==0],sd(z)))
 
     ## acc.x0 <- p.correct[df[,x==0]]
     ## acc.x1 <- p.correct[df[,x==1]]
diff --git a/simulations/Makefile b/simulations/Makefile
index b3ab77a..e595811 100644
--- a/simulations/Makefile
+++ b/simulations/Makefile
@@ -120,12 +120,10 @@ remember_irr.RDS: example_5.feather plot_irr_example.R plot_irr_dv_example.R sum
 #	sbatch --wait --verbose run_job.sbatch Rscript plot_irr_dv_example.R --infile example_6.feather --name "plot.df.example.6"
 
-
 robustness_1_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py
 	sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs
 
-
 robustness_1.feather: robustness_1_jobs
 	rm -f robustness_1.feather
 	sbatch --wait --verbose --array=1-$(shell cat robustness_1_jobs | wc -l) run_simulation.sbatch 0 robustness_1_jobs
diff --git a/simulations/pl_methods.R b/simulations/pl_methods.R
new file mode 100644
index 0000000..b3007d1
--- /dev/null
+++ b/simulations/pl_methods.R
@@ -0,0 +1,84 @@
+library(stats4)
+library(bbmle)
+library(matrixStats)
+
+zhang.mle.dv <- function(df){
+    df.obs <- df[!is.na(y.obs)]
+    df.unobs <- df[is.na(y.obs)]
+
+    fp <- df.obs[(w_pred==1) & (y.obs != w_pred), .N]
+    tn <- df.obs[(w_pred==0) & (y.obs == w_pred), .N]
+    fpr <- fp / (fp+tn)
+
+    fn <- df.obs[(w_pred==0) & (y.obs != w_pred), .N]
+    tp <- df.obs[(w_pred==1) & (y.obs == w_pred), .N]
+    fnr <- fn / (fn+tp)
+
+    nll <- function(B0=0, Bxy=0, Bzy=0){
+
+        ## observed case
+        ll.y.obs <- vector(mode='numeric', length=nrow(df.obs))
+        ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z, log=T))
+        ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z, log=T, lower.tail=FALSE))
+
+        ll <- sum(ll.y.obs)
+
+        pi.y.1 <- with(df.unobs, plogis(B0 + Bxy * x + Bzy * z, log=T))
+        ## pi.y.0 <- with(df.unobs, plogis(B0 + Bxy * x + Bzy * z, log=T, lower.tail=FALSE))
+
+        lls <- with(df.unobs, colLogSumExps(rbind(w_pred * colLogSumExps(rbind(log(fpr), log(1 - fnr - fpr) + pi.y.1)),
+                                                  (1-w_pred) * (log(1-fpr) - exp(log(1-fnr-fpr) + pi.y.1)))))
+
+        ll <- ll + sum(lls)
+        print(paste0(B0, Bxy, Bzy))
+        print(ll)
+        return(-ll)
+    }
+    mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), method='L-BFGS-B',
+                   lower=c(B0=-Inf, Bxy=-Inf, Bzy=-Inf),
+                   upper=c(B0=Inf, Bxy=Inf, Bzy=Inf))
+    return(mlefit)
+}
+
+
+## model from Zhang's arXiv paper, with predictions for y
+## Zhang got this model from Hausman 1998
+zhang.mle.iv <- function(df){
+    df.obs <- df[!is.na(x.obs)]
+    df.unobs <- df[is.na(x.obs)]
+
+    tn <- df.obs[(w_pred==0) & (x.obs == w_pred), .N]
+    fn <- df.obs[(w_pred==0) & (x.obs==1), .N]
+    npv <- tn / (tn + fn)
+
+    tp <- df.obs[(w_pred==1) & (x.obs == w_pred), .N]
+    fp <- df.obs[(w_pred==1) & (x.obs==0), .N]
+    ppv <- tp / (tp + fp)
+
+    nll <- function(B0=0, Bxy=0, Bzy=0, sigma_y=0.1){
+
+        ## fpr = 1 - TNR
+        ## fnr = 1 - TPR
+        ### Problem: accounting for uncertainty in ppv / npv
+
+        ## observed case
+        ll.y.obs <- with(df.obs, dnorm(y, B0 + Bxy * x + Bzy * z, sd=sigma_y, log=T))
+        ll <- sum(ll.y.obs)
+
+        ## unobserved case; integrate out x
+        ll.x.1 <- with(df.unobs, dnorm(y, B0 + Bxy + Bzy * z, sd=sigma_y, log=T))
+        ll.x.0 <- with(df.unobs, dnorm(y, B0 + Bzy * z, sd=sigma_y, log=T))
+
+        ## case x == 1
+        lls.x.1 <- colLogSumExps(rbind(log(ppv) + ll.x.1, log(1-ppv) + ll.x.0))
+
+        ## case x == 0
+        lls.x.0 <- colLogSumExps(rbind(log(1-npv) + ll.x.1, log(npv) + ll.x.0))
+
+        lls <- colLogSumExps(rbind(df.unobs$w_pred * lls.x.1, (1-df.unobs$w_pred) * lls.x.0))
+        ll <- ll + sum(lls)
+        return(-ll)
+    }
+    mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), method='L-BFGS-B',
+                   lower=list(sigma_y=0.0001, B0=-Inf, Bxy=-Inf, Bzy=-Inf),
+                   upper=list(sigma_y=Inf, B0=Inf, Bxy=Inf, Bzy=Inf))
+    return(mlefit)
+}
diff --git a/simulations/robustness_check_notes.md b/simulations/robustness_check_notes.md
new file mode 100644
index 0000000..e6adc8a
--- /dev/null
+++ b/simulations/robustness_check_notes.md
@@ -0,0 +1,5 @@
+# robustness_1.RDS
+
+Tests how robust the MLE method is when the model for $X$ is less precise. In the main result, we include $Z$ on the right-hand side of the `truth_formula`.
+In this robustness check, the `truth_formula` is an intercept-only model.
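
A note on the unobserved-case likelihood in zhang.mle.dv: for rows without a gold-standard label, y is marginalized out of the distribution of the classifier output, giving P(w_pred=1 | x, z) = fpr + (1 - fnr - fpr) * P(y=1 | x, z). The sketch below is illustrative and not part of the patch (the function and argument names are invented for this note); it writes that quantity directly in probability space rather than through the log-space approximation used above.

    ## minimal sketch: marginal log likelihood of w_pred for the unlabeled rows
    ## assumes df.unobs is a data.table with columns w_pred, x, and z
    dv.ll.unobs <- function(df.unobs, fpr, fnr, B0, Bxy, Bzy){
        pi.y.1 <- with(df.unobs, plogis(B0 + Bxy * x + Bzy * z))  # P(y=1 | x, z)
        p.w.1 <- fpr + (1 - fnr - fpr) * pi.y.1                   # P(w_pred=1), y integrated out
        with(df.unobs, sum(w_pred * log(p.w.1) + (1 - w_pred) * log(1 - p.w.1)))
    }

Because exactly one of the two branches applies to each row, they combine as the plain weighted sum w_pred * log P(w_pred=1) + (1 - w_pred) * log P(w_pred=0); a log-sum-exp is only needed inside a branch, where the two y cases are mixed.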
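
In zhang.mle.iv the roles are reversed: y is observed (gaussian) and the binary predictor x is unobserved outside the validation sample, so the unlabeled rows integrate x out of the outcome model. In the notation of the code, with ppv and npv estimated from the labeled rows,

$$P(y \mid z, w) = P(x=1 \mid w)\,\mathcal{N}(y;\ B_0 + B_{xy} + B_{zy} z,\ \sigma_y) + P(x=0 \mid w)\,\mathcal{N}(y;\ B_0 + B_{zy} z,\ \sigma_y),$$

where $P(x=1 \mid w=1) = \mathrm{ppv}$ and $P(x=1 \mid w=0) = 1 - \mathrm{npv}$. The vectors lls.x.1 and lls.x.0 are the logs of these two mixtures, computed stably with matrixStats::colLogSumExps.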
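
A hypothetical end-to-end call, assuming pl_methods.R is sourced from within simulations/ and the data follow the layout the simulation scripts produce: the column x holds the true predictor (read by the labeled-case likelihood), and x.obs is NA outside the validation sample. The toy data and parameter values below are invented for illustration, not repository output.

    library(data.table)
    source("pl_methods.R")  # assumed to be run from simulations/

    set.seed(1)
    N <- 5000; m <- 500                               # m labeled (validation) rows
    z <- rnorm(N)
    x <- rbinom(N, 1, plogis(0.3 * z))                # true binary predictor
    y <- 0.3 * x - 0.3 * z + rnorm(N, sd=0.4)         # gaussian outcome; Bxy=0.3, Bzy=-0.3
    w_pred <- rbinom(N, 1, ifelse(x == 1, 0.8, 0.2))  # noisy classifier predictions of x
    df <- data.table(y, z, x, w_pred,
                     x.obs = ifelse(seq_len(N) <= m, x, NA))

    fit <- zhang.mle.iv(df)
    summary(fit)  # bbmle::mle2 fit: estimates and standard errors for B0, Bxy, Bzy, sigma_y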