## Project helpers: `remember()` persists named values to the current remember
## file; `measerr_mle()` fits the measurement-error model used below.
source("../simulations/RemembR/R/RemembeR.R")
change.remember.file("iv_perspective_example.RDS")
source('load_perspective_data.R')
source("../simulations/measerr_methods.R")

## Store the summary objects created by load_perspective_data.R.
remember(accuracies, "civil_comments_accuracies")
remember(f1s, "civil_comments_f1s")
remember(positive_cases, "civil_comments_positive_cases")
remember(proportions_cases, "civil_comments_proportions_cases")
remember(cortab, "civil_comments_cortab")
remember(nrow(df), 'n.annotated.comments')

# for reproducibility
set.seed(1)

## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)?
## Or, are funny comments more toxic when people disclose that they are white?

## Fit and record four competing models on the same data.
##
## Models fitted, in order:
##   1. "pred"    - logistic regression of `pred_formula` on all of `df`.
##   2. "coder"   - logistic regression of `outcome_formula` on all of `df`.
##   3. "sample"  - logistic regression of `outcome_formula` where
##                  `toxicity_coded` is kept only for a random subsample of
##                  rows and set to NA elsewhere.
##   4. "measerr" - `measerr_mle()` on that same partially-labelled data.
##
## Arguments:
##   pred_formula, outcome_formula, proxy_formula, truth_formula:
##       model formulas; the first two go to `glm()`, the last three
##       (outcome, proxy, truth) go to `measerr_mle()`.
##   df:              a data.table containing `toxicity_coded` and every
##                    variable referenced by the formulas.
##   sample.prop:     proportion of rows that keep their label, or NULL to
##                    derive it from `sample.size`.
##   sample.size:     number of rows that keep their label, or NULL to derive
##                    it from `sample.prop`.
##   remember_prefix: string prepended (with no separator) to every key
##                    passed to `remember()`.
##
## At least one of `sample.prop` / `sample.size` must be non-NULL.
## Called for its side effects (values stored via `remember()`); the return
## value is whatever the final `remember()` call returns.
compare_iv_models <- function(pred_formula, outcome_formula, proxy_formula,
                              truth_formula, df, sample.prop, sample.size,
                              remember_prefix) {

  ## Without either quantity the subsample size would silently become
  ## numeric(0); fail loudly instead.
  if (is.null(sample.prop) && is.null(sample.size)) {
    stop("compare_iv_models: at least one of 'sample.prop' and ",
         "'sample.size' must be non-NULL", call. = FALSE)
  }

  ## Derive whichever of the two sampling quantities was not supplied.
  if (is.null(sample.prop)) {
    sample.prop <- sample.size / nrow(df)
  }
  if (is.null(sample.size)) {
    sample.size <- nrow(df) * sample.prop
  }

  remember(sample.size, paste0(remember_prefix, "sample.size"))
  remember(sample.prop, paste0(remember_prefix, "sample.prop"))
  remember(pred_formula, paste0(remember_prefix, "pred_formula"))
  remember(outcome_formula, paste0(remember_prefix, 'outcome_formula'))
  remember(proxy_formula, paste0(remember_prefix, 'proxy_formula'))
  remember(truth_formula, paste0(remember_prefix, 'truth_formula'))

  ## NOTE(review): the values stored under the "se_*" keys below are
  ## diag(vcov(.)), i.e. coefficient variances, not their square roots.
  ## Kept as-is because consumers of the remember file are not visible here;
  ## confirm whether they apply sqrt() themselves.

  ## Model 1: full data, automated-classifier predictor.
  pred_model <- glm(pred_formula, df, family = binomial(link = 'logit'))
  remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model"))
  remember(diag(vcov(pred_model)), paste0(remember_prefix, "se_pred_model"))

  ## Model 2: full data, human-coded predictor.
  coder_model <- glm(outcome_formula, df, family = binomial(link = 'logit'))
  remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model"))
  remember(diag(vcov(coder_model)), paste0(remember_prefix, "se_coder_model"))

  ## Work on a copy so the caller's table is not modified by reference.
  ## Only the sampled rows receive a value in toxicity_coded_1 (the rest are
  ## NA); copying that column back blanks the label outside the sample.
  df_measerr_method <- copy(df)
  df_measerr_method[sample(seq_len(.N), sample.size),
                    toxicity_coded_1 := toxicity_coded]
  df_measerr_method[, toxicity_coded := toxicity_coded_1]

  ## Model 3: same outcome model, fitted to the partially-labelled table.
  sample_model <- glm(outcome_formula, df_measerr_method,
                      family = binomial(link = 'logit'))
  remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model"))
  remember(diag(vcov(sample_model)),
           paste0(remember_prefix, "se_sample_model"))

  ## Model 4: measurement-error MLE on the partially-labelled table.
  measerr_model <- measerr_mle(df_measerr_method,
                               outcome_formula,
                               outcome_family = binomial(link = 'logit'),
                               proxy_formula = proxy_formula,
                               proxy_family = binomial(link = 'logit'),
                               truth_formula = truth_formula,
                               truth_family = binomial(link = 'logit'))

  ## NOTE(review): this is the diagonal of the inverse Hessian, stored under a
  ## "stderr" key without taking sqrt() - presumably variances; confirm
  ## against measerr_mle() and the downstream reader.
  inv_hessian <- solve(measerr_model$hessian)
  inv_hessian_diag <- diag(inv_hessian)
  remember(inv_hessian_diag, paste0(remember_prefix, "measerr_model_stderr"))
  remember(measerr_model$par, paste0(remember_prefix, "measerr_model_par"))
}

## print("running first iv example")

## sample.prop <- 0.05

## compare_iv_models(white ~ toxicity_pred*funny,
##                   outcome_formula = white ~ toxicity_coded*funny,
##                   proxy_formula = toxicity_pred ~ toxicity_coded*funny*white,
##                   truth_formula = toxicity_coded ~ 1,
##                   df=df,
##                   sample.prop=sample.prop,
##                   remember_prefix='cc_ex_tox.funny.white')

## Formulas shared by the first two runs below.
pred_formula <- race_disclosed ~ likes * toxicity_pred
outcome_formula <- race_disclosed ~ likes * toxicity_coded
proxy_formula <- toxicity_pred ~ toxicity_coded * race_disclosed * likes
truth_formula <- toxicity_coded ~ 1

## Run 1: 1% of rows keep their human label.
print("running first example")

compare_iv_models(pred_formula = pred_formula,
                  outcome_formula = outcome_formula,
                  proxy_formula = proxy_formula,
                  truth_formula = truth_formula,
                  df = df,
                  sample.prop = 0.01,
                  sample.size = NULL,
                  remember_prefix = 'cc_ex_tox.likes.race_disclosed')

## Run 2: a fixed 10000 rows keep their human label.
print("running second example")

compare_iv_models(pred_formula = pred_formula,
                  outcome_formula = outcome_formula,
                  proxy_formula = proxy_formula,
                  truth_formula = truth_formula,
                  df = df,
                  sample.prop = NULL,
                  sample.size = 10000,
                  remember_prefix = 'cc_ex_tox.likes.race_disclosed.medsamp')

## Run 3: 5% of rows keep their human label; note the proxy formula here is
## additive (no interactions and no `likes` term), unlike runs 1 and 2.
print("running third example")

compare_iv_models(pred_formula = race_disclosed ~ likes * toxicity_pred,
                  outcome_formula = race_disclosed ~ likes * toxicity_coded,
                  proxy_formula = toxicity_pred ~ toxicity_coded + race_disclosed,
                  truth_formula = toxicity_coded ~ 1,
                  df = df,
                  sample.prop = 0.05,
                  sample.size = NULL,
                  remember_prefix = 'cc_ex_tox.likes.race_disclosed.largesamp')