update real data examples code and rerun project.
[ml_measurement_error_public.git] / civil_comments / 01_dv_example.R
index c26d640bba11cf244bc1d0b76ef8d971df951f34..12af561222f42ce740b6a27e6787eb14d5e688c3 100644 (file)
@@ -3,15 +3,33 @@ source("../simulations/measerr_methods.R")
 source("../simulations/RemembR/R/RemembeR.R")
 
 change.remember.file("dv_perspective_example.RDS")
-
+remember(accuracies, "civil_comments_accuracies")
+remember(f1s, "civil_comments_f1s")
+remember(positive_cases, "civil_comments_positive_cases")
+remember(proportions_cases, "civil_comments_proportions_cases")
+remember(cortab, "civil_comments_cortab")
+remember(nrow(df), 'n.annotated.comments')
 # for reproducibility
-set.seed(1111)
+set.seed(111)
 
 ## another simple enough example: is P(toxic | funny and white) > P(toxic | funny and not white)? Or, are funny comments more toxic when people disclose that they are white?
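+## A rough sketch of that raw comparison (illustration only, not part of the modeling
+## pipeline below; it assumes df is a data.table and that the columns toxicity_coded,
+## funny, and white named in the comment above exist -- names taken from the comment,
+## not verified in this hunk):
+## df[funny == TRUE, .(p_toxic = mean(toxicity_coded)), by = white]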
 
-compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, remember_prefix){
+compare_dv_models <- function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, sample.size, remember_prefix){
+    if(is.null(sample.prop)){
+        sample.prop <- sample.size / nrow(df)
+    }
+    if(is.null(sample.size)){
+        sample.size <- nrow(df) * sample.prop
+    }
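+    ## Exactly one of sample.prop / sample.size is expected to be NULL; the other is
+    ## derived from it. For example (illustrative numbers, not from this dataset):
+    ## with nrow(df) == 100000, sample.prop = 0.01 implies sample.size = 1000, and
+    ## sample.size = 10000 implies sample.prop = 0.1.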
+
     pred_model <- glm(pred_formula, df, family=binomial(link='logit'))
 
+    remember(sample.size, paste0(remember_prefix, "sample.size"))
+    remember(sample.prop, paste0(remember_prefix, "sample.prop"))
+    remember(pred_formula, paste0(remember_prefix, "pred_formula"))
+    remember(outcome_formula, paste0(remember_prefix, 'outcome_formula'))
+    remember(proxy_formula, paste0(remember_prefix, 'proxy_formula'))
+
     remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model"))
     remember(diag(vcov((pred_model))), paste0(remember_prefix, "se_pred_model"))
 
@@ -19,7 +37,7 @@ compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, s
     remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model"))
     remember(diag(vcov((coder_model))), paste0(remember_prefix, "se_coder_model"))
 
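+    ## Reveal the human label (toxicity_coded) for a random subsample of sample.size rows
+    ## via toxicity_coded_1; the next line overwrites toxicity_coded with that column, so
+    ## unsampled rows become NA and sample_model is fit only on the labeled subsample.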
-    df_measerr_method <- copy(df)[sample(1:.N, sample.prop * .N), toxicity_coded_1 := toxicity_coded]
+    df_measerr_method <- copy(df)[sample(1:.N, sample.size), toxicity_coded_1 := toxicity_coded]
     df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1]
     sample_model <- glm(outcome_formula, df_measerr_method, family=binomial(link='logit'))
     remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model"))
@@ -35,20 +53,37 @@ compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, s
 
 print("running first example")
 
-compare_dv_models(pred_formula = toxicity_pred ~ funny*white,
-                  outcome_formula = toxicity_coded ~ funny*white,
-                  proxy_formula = toxicity_pred ~ toxicity_coded*funny*white,
+pred_formula <- toxicity_pred ~ likes + race_disclosed
+outcome_formula <- toxicity_coded ~ likes + race_disclosed
+proxy_formula <- toxicity_pred ~ toxicity_coded*race_disclosed*likes
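+## These shared formula objects are reused by all three compare_dv_models() calls below.
+## proxy_formula models the classifier's prediction (toxicity_pred) as a function of the
+## human label (toxicity_coded) interacted with the covariates.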
+
+compare_dv_models(pred_formula = pred_formula,
+                  outcome_formula = outcome_formula,
+                  proxy_formula = proxy_formula,
                   df=df,
                   sample.prop=0.01,
-                  remember_prefix='cc_ex_tox.funny.white')
+                  sample.size=NULL,
+                  remember_prefix='cc_ex_tox.likes.race_disclosed')
 
 
 print("running second example")
 
-compare_dv_models(pred_formula = toxicity_pred ~ likes+race_disclosed,
-                  outcome_formula = toxicity_coded ~ likes + race_disclosed,
-                  proxy_formula = toxicity_pred ~ toxicity_coded*likes*race_disclosed,
+compare_dv_models(pred_formula = pred_formula,
+                  outcome_formula = outcome_formula,
+                  proxy_formula = proxy_formula,
                   df=df,
-                  sample.prop=0.01,
-                  remember_prefix='cc_ex_tox.funny.race_disclosed')
+                  sample.size=10000,
+                  sample.prop=NULL,
+                  remember_prefix='cc_ex_tox.likes.race_disclosed.medsamp')
+
+
+print("running third example")
+
+compare_dv_models(pred_formula = pred_formula,
+                  outcome_formula = outcome_formula,
+                  proxy_formula = proxy_formula,
+                  df=df,
+                  sample.prop=0.05,
+                  sample.size=NULL,
+                  remember_prefix='cc_ex_tox.likes.race_disclosed.largesamp')
 
