check in some old simulation updates and a dv example with real data

[ml_measurement_error_public.git] / civil_comments / load_perspective_data.R
diff --git a/civil_comments/load_perspective_data.R b/civil_comments/load_perspective_data.R

new file mode 100644 (file)

index 0000000..636c423
--- /dev/null
+++ b/civil_comments/load_perspective_data.R
@@ -0,0 +1,41 @@
+library(data.table)
+library(MASS)
+
+set.seed(1111)
+
+# Scores read from perspective_scores.csv. `id` is cast to character because
+# it is the join key used below; `:=` updates the table by reference, so no
+# reassignment is needed.
+scores <- fread("perspective_scores.csv")
+scores[, id := as.character(id)]
+
+# Annotated comments read from all_data.csv. Cast `id` on this side as well:
+# data.table refuses to join a character key against a non-character one, so
+# both keys must share a type no matter what fread() inferred.
+df <- fread("all_data.csv")
+df[, id := as.character(id)]
+
+# only use the data that has identity annotations
+df <- df[identity_annotator_count > 0]
+
+# Show the annotated rows that have no matching score. print() is explicit so
+# the diagnostic also appears when the script is run through source(), which
+# does not auto-print top-level values by default.
+print(df[!(id %in% scores$id)])
+
+# Inner join on `id`: keep only rows present in both tables
+# (nomatch = NULL drops rows of `scores` with no match in `df`).
+df <- df[scores, on = "id", nomatch = NULL]
+
+# Dichotomise the model probabilities at 0.5 (inclusive): each `*_prob`
+# column yields a logical `*_pred` column.
+df[, c("identity_attack_pred", "insult_pred", "profanity_pred",
+       "severe_toxicity_pred", "threat_pred", "toxicity_pred") :=
+       lapply(.SD, function(p) p >= 0.5),
+   .SDcols = c("identity_attack_prob", "insult_prob", "profanity_prob",
+               "severe_toxicity_prob", "threat_prob", "toxicity_prob")]
+
+# Dichotomise the annotation columns the same way into `*_coded` columns.
+# Note that `profanity_coded` is derived from the `obscene` column.
+df[, c("identity_attack_coded", "insult_coded", "profanity_coded",
+       "severe_toxicity_coded", "threat_coded", "toxicity_coded") :=
+       lapply(.SD, function(a) a >= 0.5),
+   .SDcols = c("identity_attack", "insult", "obscene",
+               "severe_toxicity", "threat", "toxicity")]
+
+# Elementwise test for a value of at least 0.5. Despite the "gt" in the name
+# the comparison is inclusive (>=). Returns a logical vector the same length
+# as `v`; NA inputs give NA.
+gt.0.5 <- function(v) {
+  v >= 0.5
+}
+
+# Apply `fun` to every vector passed in `...` and combine the results with an
+# elementwise OR, giving one logical value per position (row).
+#
+# fun: function mapping a vector to a logical vector of the same length.
+# ...: equal-length vectors (e.g. data.table columns) to test.
+#
+# Working on the vectors directly instead of round-tripping through
+# apply(cbind(...)) keeps the result well-defined for one-row and zero-row
+# input, where the nested apply() collapsed the matrix and then failed.
+# NA handling matches any(): TRUE wins over NA, NA wins over FALSE.
+dt.apply.any <- function(fun, ...) {
+  Reduce(`|`, lapply(list(...), fun))
+}
+
+# One logical flag per identity category: TRUE when any of the category's
+# annotation columns is at least 0.5 for that row. `:=` adds the columns by
+# reference, so the table does not need to be reassigned.
+df[, `:=`(
+  gender_disclosed = dt.apply.any(
+    gt.0.5, male, female, transgender, other_gender
+  ),
+  sexuality_disclosed = dt.apply.any(
+    gt.0.5, heterosexual, bisexual, other_sexual_orientation
+  ),
+  religion_disclosed = dt.apply.any(
+    gt.0.5, christian, jewish, hindu, buddhist, atheist, muslim,
+    other_religion
+  ),
+  race_disclosed = dt.apply.any(
+    gt.0.5, white, black, asian, latino, other_race_or_ethnicity
+  ),
+  disability_disclosed = dt.apply.any(
+    gt.0.5, physical_disability, intellectual_or_learning_disability,
+    psychiatric_or_mental_illness, other_disability
+  )
+)]
+
+# Replace `white` with its thresholded logical version. This has to come
+# after race_disclosed, which reads the original `white` values.
+df[, white := gt.0.5(white)]