]> code.communitydata.science - ml_measurement_error_public.git/blob - civil_comments/01_dv_example.R
c26d640bba11cf244bc1d0b76ef8d971df951f34
[ml_measurement_error_public.git] / civil_comments / 01_dv_example.R
## Load the data and the helper code this script relies on.
## NOTE(review): `df`, `measerr_mle_dv()`, `remember()` and
## `change.remember.file()` are used below but not defined in this file --
## presumably they come from the three files sourced here; confirm.
source("load_perspective_data.R")
source("../simulations/measerr_methods.R")
source("../simulations/RemembR/R/RemembeR.R")

## Select the file used by the RemembeR helpers for this script
## (presumably where remember() persists its values -- confirm).
change.remember.file("dv_perspective_example.RDS")

## Fix the RNG seed so the random subsample drawn inside
## compare_dv_models() is reproducible.
set.seed(1111)

## Another simple enough example:
## is P(toxic | funny and white) > P(toxic | funny nand white)?
## Or, are funny comments more toxic when people disclose that they are white?
11
## Fit several logistic regressions for a binary outcome and store their
## estimates through remember().
##
## Arguments:
##   pred_formula     formula for the model fit on all rows of `df`
##                    (the examples below use toxicity_pred as its response).
##   outcome_formula  formula for the model of the coded outcome; fit once on
##                    all rows and once on the partially coded copy.
##   proxy_formula    formula handed to measerr_mle_dv() as `proxy_formula`.
##   df               data.table holding the variables named in the formulas;
##                    it must contain a `toxicity_coded` column. It is copied,
##                    so the caller's table is not modified.
##   sample.prop      proportion of rows that keep their `toxicity_coded`
##                    value in the partially coded copy.
##   remember_prefix  string pasted directly (no separator) in front of every
##                    key passed to remember().
##
## Returns whatever the final remember() call returns.
##
## NOTE(review): the "se_*" and "*_stderr" keys store diag(vcov(.)) and
## diag(solve(hessian)) as they are, i.e. without taking a square root --
## confirm that whatever reads these values expects that.
compare_dv_models <- function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, remember_prefix) {
    ## Small helper that builds the storage key for one result.
    key <- function(suffix) paste0(remember_prefix, suffix)
    logit_family <- binomial(link = "logit")

    ## Model for pred_formula, fit on every row of df.
    full_pred_fit <- glm(pred_formula, data = df, family = logit_family)
    remember(coef(full_pred_fit), key("coef_pred_model"))
    remember(diag(vcov(full_pred_fit)), key("se_pred_model"))

    ## Model for outcome_formula, fit on every row of df.
    full_coded_fit <- glm(outcome_formula, data = df, family = logit_family)
    remember(coef(full_coded_fit), key("coef_coder_model"))
    remember(diag(vcov(full_coded_fit)), key("se_coder_model"))

    ## Work on a copy in which toxicity_coded survives only for a random
    ## subset of rows: the sampled rows receive toxicity_coded_1, every other
    ## row is left NA there, and toxicity_coded is then overwritten with it.
    partially_coded <- copy(df)
    partially_coded[sample(seq_len(.N), sample.prop * .N), toxicity_coded_1 := toxicity_coded]
    partially_coded[, toxicity_coded := toxicity_coded_1]

    ## Same outcome model, now on the partially coded copy.
    subsample_fit <- glm(outcome_formula, data = partially_coded, family = logit_family)
    remember(coef(subsample_fit), key("coef_sample_model"))
    remember(diag(vcov(subsample_fit)), key("se_sample_model"))

    ## Measurement-error model from measerr_methods.R, given the same copy.
    mle_fit <- measerr_mle_dv(partially_coded, outcome_formula,
                              outcome_family = logit_family,
                              proxy_formula = proxy_formula,
                              proxy_family = logit_family)

    ## Diagonal of the inverted Hessian reported by the fit.
    hessian_inverse_diag <- diag(solve(mle_fit$hessian))
    remember(hessian_inverse_diag, key("measerr_model_stderr"))
    remember(mle_fit$par, key("measerr_model_par"))
}
35
## Example 1: toxicity modelled on funny, white and their interaction,
## keeping the coded outcome for 1% of the rows.
print("running first example")

compare_dv_models(
    pred_formula    = toxicity_pred ~ funny * white,
    outcome_formula = toxicity_coded ~ funny * white,
    proxy_formula   = toxicity_pred ~ toxicity_coded * funny * white,
    df              = df,
    sample.prop     = 0.01,
    remember_prefix = "cc_ex_tox.funny.white"
)
44
45
## Example 2: toxicity modelled on likes and race_disclosed (additive in the
## pred/outcome formulas, fully interacted in the proxy formula), keeping the
## coded outcome for 1% of the rows.
## A stray token that followed the outcome_formula argument has been removed;
## it made the call unparseable.
## NOTE(review): the prefix still says "funny" although the formulas use
## `likes`; it is left unchanged so that stored key names stay the same --
## confirm whether it should be renamed.
print("running second example")

compare_dv_models(
    pred_formula    = toxicity_pred ~ likes + race_disclosed,
    outcome_formula = toxicity_coded ~ likes + race_disclosed,
    proxy_formula   = toxicity_pred ~ toxicity_coded * likes * race_disclosed,
    df              = df,
    sample.prop     = 0.01,
    remember_prefix = "cc_ex_tox.funny.race_disclosed"
)
54

Community Data Science Collective || Want to submit a patch?