X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/fa05dbab6bd2c5db6ed4eccf38cff03bb4fd6683..c1dbbfd0dd88defca0ce00425910757e436284ad:/civil_comments/01_dv_example.R diff --git a/civil_comments/01_dv_example.R b/civil_comments/01_dv_example.R index 4092243..12af561 100644 --- a/civil_comments/01_dv_example.R +++ b/civil_comments/01_dv_example.R @@ -3,15 +3,33 @@ source("../simulations/measerr_methods.R") source("../simulations/RemembR/R/RemembeR.R") change.remember.file("dv_perspective_example.RDS") - +remember(accuracies, "civil_comments_accuracies") +remember(f1s, "civil_comments_f1s") +remember(positive_cases, "civil_comments_positive_cases") +remember(proportions_cases, "civil_comments_proportions_cases") +remember(cortab, "civil_comments_cortab") +remember(nrow(df), 'n.annotated.comments') # for reproducibility -set.seed(1111) +set.seed(111) ## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white? -compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, remember_prefix){ +compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, sample.size, remember_prefix){ + if(is.null(sample.prop)){ + sample.prop <- sample.size / nrow(df) + } + if(is.null(sample.size)){ + sample.size <- nrow(df) * sample.prop + } + pred_model <- glm(pred_formula, df, family=binomial(link='logit')) + remember(sample.size, paste0(remember_prefix, "sample.size")) + remember(sample.prop, paste0(remember_prefix, "sample.prop")) + remember(pred_formula, paste0(remember_prefix, "pred_formula")) + remember(outcome_formula, paste0(remember_prefix, 'outcome_formula')) + remember(proxy_formula, paste0(remember_prefix, 'proxy_formula')) + remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model")) remember(diag(vcov((pred_model))), paste0(remember_prefix, "se_pred_model")) @@ -19,7 +37,7 @@ compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, s remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model")) remember(diag(vcov((coder_model))), paste0(remember_prefix, "se_coder_model")) - df_measerr_method <- copy(df)[sample(1:.N, sample.prop * .N), toxicity_coded_1 := toxicity_coded] + df_measerr_method <- copy(df)[sample(1:.N, sample.size), toxicity_coded_1 := toxicity_coded] df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1] sample_model <- glm(outcome_formula, df_measerr_method, family=binomial(link='logit')) remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model")) @@ -35,20 +53,37 @@ compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, s print("running first example") -compare_dv_models(pred_formula = toxicity_pred ~ funny*white, - outcome_formula = toxicity_coded ~ funny*white, proxy_formula, - proxy_formula = toxicity_pred ~ toxicity_coded*funny*white, +pred_formula = toxicity_pred ~ likes + race_disclosed +outcome_formula = toxicity_coded ~ likes + race_disclosed +proxy_formula = toxicity_pred ~ toxicity_coded*race_disclosed*likes + +compare_dv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, df=df, sample.prop=0.01, - remember_prefix='cc_ex_tox.funny.white') + sample.size=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed') print("running second example") -compare_dv_models(pred_formula = toxicity_pred ~ likes+race_disclosed, - outcome_formula = toxicity_coded ~ likes + race_disclosed, proxy_formula, - proxy_formula = toxicity_pred ~ toxicity_coded*likes*race_disclosed, +compare_dv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, df=df, - sample.prop=0.01, - remember_prefix='cc_ex_tox.funny.race_disclosed') + sample.size=10000, + sample.prop=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed.medsamp') + + +print("running third example") + +compare_dv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, + df=df, + sample.prop=0.05, + sample.size=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed.largesamp')