# Provenance: recovered from a git blobdiff that adds this file as new
# (civil_comments/02_iv_example.R, 107 lines).
# X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/c066f900d3503951013185f75d1f14436eb042cb..c45ea9dfebca86dfddc1e9237aa74866c5166519:/civil_comments/02_iv_example.R

# Load the remember() helpers and direct everything recorded by this script
# to its own RDS file.
source("../simulations/RemembR/R/RemembeR.R")
change.remember.file("iv_perspective_example.RDS")

# Data and estimation helpers. The objects recorded below (accuracies, f1s,
# positive_cases, proportions_cases, cortab, df) are presumably created by
# load_perspective_data.R -- they are not defined in this file.
source("load_perspective_data.R")
source("../simulations/measerr_methods.R")

# Record descriptive statistics of the annotated data set.
remember(accuracies, "civil_comments_accuracies")
remember(f1s, "civil_comments_f1s")
remember(positive_cases, "civil_comments_positive_cases")
remember(proportions_cases, "civil_comments_proportions_cases")
remember(cortab, "civil_comments_cortab")
remember(nrow(df), "n.annotated.comments")

# Fix the RNG seed so the random subsamples drawn later are reproducible.
set.seed(1)

## Another simple enough example:
## is P(toxic | funny and white) > P(toxic | funny nand white)?
## Or, are funny comments more toxic when people disclose that they are white?

#' Fit and record several estimators of the same outcome model.
#'
#' Fits, in this order, and stores each result through `remember()`:
#'   1. a logistic regression of `pred_formula` on all of `df`;
#'   2. a logistic regression of `outcome_formula` on all of `df`;
#'   3. a logistic regression of `outcome_formula` on a copy of `df` in which
#'      `toxicity_coded` is kept only for a random subsample of rows;
#'   4. `measerr_mle()` on that same partially labelled copy.
#'
#' @param pred_formula Formula for model 1.
#' @param outcome_formula Formula for models 2 and 3; also passed to
#'   `measerr_mle()`.
#' @param proxy_formula Formula passed to `measerr_mle()` as `proxy_formula`.
#' @param truth_formula Formula passed to `measerr_mle()` as `truth_formula`.
#' @param df A data.table (it is used with `copy()` and `:=`) that has a
#'   `toxicity_coded` column. It is not modified; work happens on a copy.
#' @param sample.prop Share of rows that keep `toxicity_coded`, or `NULL` to
#'   derive it as `sample.size / nrow(df)`.
#' @param sample.size Number of rows that keep `toxicity_coded`, or `NULL` to
#'   derive it as `nrow(df) * sample.prop`. When both are supplied,
#'   `sample.size` drives the sampling and `sample.prop` is only recorded.
#' @param remember_prefix String prepended to every key given to `remember()`.
#' @return Called for its side effects; the value is whatever the final
#'   `remember()` call returns.
compare_iv_models <- function(pred_formula, outcome_formula, proxy_formula,
                              truth_formula, df, sample.prop, sample.size,
                              remember_prefix) {
  # Fail fast: with neither value the derived size would be numeric(0) and
  # the failure would only surface later, after results were already recorded.
  if (is.null(sample.prop) && is.null(sample.size)) {
    stop("supply at least one of `sample.prop` or `sample.size`",
         call. = FALSE)
  }

  if (is.null(sample.prop)) {
    sample.prop <- sample.size / nrow(df)
  }
  if (is.null(sample.size)) {
    sample.size <- nrow(df) * sample.prop
  }

  # Record the configuration of this run.
  remember(sample.size, paste0(remember_prefix, "sample.size"))
  remember(sample.prop, paste0(remember_prefix, "sample.prop"))
  remember(pred_formula, paste0(remember_prefix, "pred_formula"))
  remember(outcome_formula, paste0(remember_prefix, "outcome_formula"))
  remember(proxy_formula, paste0(remember_prefix, "proxy_formula"))
  remember(truth_formula, paste0(remember_prefix, "truth_formula"))

  # NOTE(review): every "se_*" / "*_stderr" value stored below is the
  # diagonal of a variance-covariance matrix with no sqrt() applied, i.e.
  # variances rather than standard errors. Stored values are deliberately
  # left unchanged here -- confirm that readers of the RDS apply sqrt().

  # Model 1: pred_formula on the full data.
  pred_model <- glm(pred_formula, df, family = binomial(link = "logit"))
  remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model"))
  remember(diag(vcov(pred_model)), paste0(remember_prefix, "se_pred_model"))

  # Model 2: outcome_formula on the full data.
  coder_model <- glm(outcome_formula, df, family = binomial(link = "logit"))
  remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model"))
  remember(diag(vcov(coder_model)), paste0(remember_prefix, "se_coder_model"))

  # Work on a copy so the caller's table is untouched. Only the sampled rows
  # receive a value in the helper column; the remaining rows are left NA, and
  # the helper column then overwrites toxicity_coded.
  df_measerr_method <- copy(df)
  labelled_rows <- sample(seq_len(nrow(df_measerr_method)), sample.size)
  df_measerr_method[labelled_rows, toxicity_coded_1 := toxicity_coded]
  df_measerr_method[, toxicity_coded := toxicity_coded_1]

  # Model 3: outcome_formula on the partially labelled data.
  sample_model <- glm(outcome_formula, df_measerr_method,
                      family = binomial(link = "logit"))
  remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model"))
  remember(diag(vcov(sample_model)),
           paste0(remember_prefix, "se_sample_model"))

  # Model 4: measerr_mle() on the partially labelled data.
  measerr_model <- measerr_mle(df_measerr_method,
                               outcome_formula,
                               outcome_family = binomial(link = "logit"),
                               proxy_formula = proxy_formula,
                               proxy_family = binomial(link = "logit"),
                               truth_formula = truth_formula,
                               truth_family = binomial(link = "logit"))

  # Diagonal of the inverted Hessian (presumably the parameter variances --
  # verify against measerr_mle). Named so as not to mask base::stderr().
  inv_hessian <- solve(measerr_model$hessian)
  inv_hessian_diag <- diag(inv_hessian)
  remember(inv_hessian_diag, paste0(remember_prefix, "measerr_model_stderr"))
  remember(measerr_model$par, paste0(remember_prefix, "measerr_model_par"))
}

## print("running first iv example")

## sample.prop <- 0.05

## compare_iv_models(white ~ toxicity_pred*funny,
##                   outcome_formula = white ~ toxicity_coded*funny,
##                   proxy_formula = toxicity_pred ~ toxicity_coded*funny*white,
##                   truth_formula = toxicity_coded ~ 1,
##                   df=df,
##                   sample.prop=sample.prop,
##                   remember_prefix='cc_ex_tox.funny.white')

# Formulas shared by the examples below.
pred_formula <- race_disclosed ~ likes * toxicity_pred
outcome_formula <- race_disclosed ~ likes * toxicity_coded
proxy_formula <- toxicity_pred ~ toxicity_coded * race_disclosed * likes
truth_formula <- toxicity_coded ~ 1

# Example 1: 1% of the rows keep their toxicity_coded value.
print("running first example")

compare_iv_models(pred_formula = pred_formula,
                  outcome_formula = outcome_formula,
                  proxy_formula = proxy_formula,
                  truth_formula = truth_formula,
                  df = df,
                  sample.prop = 0.01,
                  sample.size = NULL,
                  remember_prefix = "cc_ex_tox.likes.race_disclosed")

# Example 2: a fixed number of 10000 rows keep their toxicity_coded value.
print("running second example")

compare_iv_models(pred_formula = pred_formula,
                  outcome_formula = outcome_formula,
                  proxy_formula = proxy_formula,
                  truth_formula = truth_formula,
                  df = df,
                  sample.prop = NULL,
                  sample.size = 10000,
                  remember_prefix = "cc_ex_tox.likes.race_disclosed.medsamp")

# Example 3: 5% of the rows, with an additive proxy model instead of the
# three-way interaction used above.
print("running third example")

compare_iv_models(pred_formula = pred_formula,
                  outcome_formula = outcome_formula,
                  proxy_formula = toxicity_pred ~ toxicity_coded + race_disclosed,
                  truth_formula = truth_formula,
                  df = df,
                  sample.prop = 0.05,
                  sample.size = NULL,
                  remember_prefix = "cc_ex_tox.likes.race_disclosed.largesamp")