# Recovered from a gitweb blobdiff of civil_comments/01_dv_example.R
# (new file, blob 4092243) in
# https://code.communitydata.science/ml_measurement_error_public.git,
# commits d8bc08f18f8c2128369ee959196e0e6080a11689..fa05dbab6bd2c5db6ed4eccf38cff03bb4fd6683.
# Diff markers were stripped and line breaks restored so the file parses as R.

source('load_perspective_data.R')
source("../simulations/measerr_methods.R")
source("../simulations/RemembR/R/RemembeR.R")

change.remember.file("dv_perspective_example.RDS")

# for reproducibility
set.seed(1111)

## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white?

# Fit four models of the same outcome and store their estimates via remember():
#   1. a logistic regression on the automatic prediction (pred_formula),
#   2. a logistic regression on the coded outcome using every row,
#   3. the same regression after toxicity_coded is blanked outside a random
#      subsample of rows,
#   4. the measerr_mle_dv() estimator fitted to that partially blanked data.
#
# Arguments:
#   pred_formula    - formula for model 1.
#   outcome_formula - formula for models 2 and 3; also passed to measerr_mle_dv().
#   proxy_formula   - formula passed to measerr_mle_dv() as its proxy model.
#   df              - a data.table (`.N` and `:=` are used on it) that has a
#                     `toxicity_coded` column plus the variables named in the
#                     formulas. It is copied before modification.
#   sample.prop     - proportion of rows that keep their toxicity_coded value.
#   remember_prefix - string prepended, without a separator, to every key
#                     handed to remember().
#
# Returns whatever the final remember() call returns; the function is called
# for its side effects.
compare_dv_models <- function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, remember_prefix) {

  # Store a fitted glm's coefficients and the diagonal of its covariance matrix.
  # NOTE(review): the "se_" keys hold diag(vcov(.)), i.e. variances, not their
  # square roots. Left as-is so stored values do not change; confirm what the
  # code reading these keys expects.
  remember_glm <- function(model, label) {
    remember(coef(model), paste0(remember_prefix, "coef_", label))
    remember(diag(vcov(model)), paste0(remember_prefix, "se_", label))
  }

  logit <- binomial(link = 'logit')

  pred_model <- glm(pred_formula, data = df, family = logit)
  remember_glm(pred_model, "pred_model")

  coder_model <- glm(outcome_formula, data = df, family = logit)
  remember_glm(coder_model, "coder_model")

  # Work on a copy so the caller's table is untouched. Only the sampled rows
  # receive toxicity_coded_1; copying it back over toxicity_coded leaves the
  # remaining rows NA.
  df_measerr_method <- copy(df)[sample(seq_len(.N), sample.prop * .N), toxicity_coded_1 := toxicity_coded]
  df_measerr_method <- df_measerr_method[, toxicity_coded := toxicity_coded_1]

  sample_model <- glm(outcome_formula, data = df_measerr_method, family = logit)
  remember_glm(sample_model, "sample_model")

  measerr_model <- measerr_mle_dv(df_measerr_method,
                                  outcome_formula,
                                  outcome_family = binomial(link = 'logit'),
                                  proxy_formula = proxy_formula,
                                  proxy_family = binomial(link = 'logit'))

  # NOTE(review): as above, this is the diagonal of the inverse Hessian without
  # a square root, stored under a "stderr" key.
  inv_hessian <- solve(measerr_model$hessian)
  hessian_diag <- diag(inv_hessian)
  remember(hessian_diag, paste0(remember_prefix, "measerr_model_stderr"))
  remember(measerr_model$par, paste0(remember_prefix, "measerr_model_par"))
}

print("running first example")

# The stray positional `proxy_formula` argument was removed from both calls:
# every formal is already supplied by name, so it raised "unused argument".
compare_dv_models(pred_formula = toxicity_pred ~ funny*white,
                  outcome_formula = toxicity_coded ~ funny*white,
                  proxy_formula = toxicity_pred ~ toxicity_coded*funny*white,
                  df = df,
                  sample.prop = 0.01,
                  remember_prefix = 'cc_ex_tox.funny.white')


print("running second example")

# NOTE(review): the prefix says "funny" but this model uses `likes`; kept
# byte-for-byte so the stored keys do not change. Confirm the intended name.
compare_dv_models(pred_formula = toxicity_pred ~ likes+race_disclosed,
                  outcome_formula = toxicity_coded ~ likes + race_disclosed,
                  proxy_formula = toxicity_pred ~ toxicity_coded*likes*race_disclosed,
                  df = df,
                  sample.prop = 0.01,
                  remember_prefix = 'cc_ex_tox.funny.race_disclosed')