From c45ea9dfebca86dfddc1e9237aa74866c5166519 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 11 Jan 2023 10:59:50 -0800 Subject: [PATCH 1/1] update real data examples code and rerun project. --- civil_comments/01_dv_example.R | 61 +++++++++++--- civil_comments/02_iv_example.R | 107 +++++++++++++++++++++++++ civil_comments/Makefile | 8 ++ civil_comments/design_example.R | 105 ------------------------ civil_comments/load_perspective_data.R | 95 ++++++++++++++++++++++ simulations/measerr_methods.R | 98 +++++++++++----------- simulations/run_job.sbatch | 2 +- 7 files changed, 306 insertions(+), 170 deletions(-) create mode 100644 civil_comments/02_iv_example.R diff --git a/civil_comments/01_dv_example.R b/civil_comments/01_dv_example.R index c26d640..12af561 100644 --- a/civil_comments/01_dv_example.R +++ b/civil_comments/01_dv_example.R @@ -3,15 +3,33 @@ source("../simulations/measerr_methods.R") source("../simulations/RemembR/R/RemembeR.R") change.remember.file("dv_perspective_example.RDS") - +remember(accuracies, "civil_comments_accuracies") +remember(f1s, "civil_comments_f1s") +remember(positive_cases, "civil_comments_positive_cases") +remember(proportions_cases, "civil_comments_proportions_cases") +remember(cortab, "civil_comments_cortab") +remember(nrow(df), 'n.annotated.comments') # for reproducibility -set.seed(1111) +set.seed(111) ## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white? -compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, remember_prefix){ +compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, sample.size, remember_prefix){ + if(is.null(sample.prop)){ + sample.prop <- sample.size / nrow(df) + } + if(is.null(sample.size)){ + sample.size <- nrow(df) * sample.prop + } + pred_model <- glm(pred_formula, df, family=binomial(link='logit')) + remember(sample.size, paste0(remember_prefix, "sample.size")) + remember(sample.prop, paste0(remember_prefix, "sample.prop")) + remember(pred_formula, paste0(remember_prefix, "pred_formula")) + remember(outcome_formula, paste0(remember_prefix, 'outcome_formula')) + remember(proxy_formula, paste0(remember_prefix, 'proxy_formula')) + remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model")) remember(diag(vcov((pred_model))), paste0(remember_prefix, "se_pred_model")) @@ -19,7 +37,7 @@ compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, s remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model")) remember(diag(vcov((coder_model))), paste0(remember_prefix, "se_coder_model")) - df_measerr_method <- copy(df)[sample(1:.N, sample.prop * .N), toxicity_coded_1 := toxicity_coded] + df_measerr_method <- copy(df)[sample(1:.N, sample.size), toxicity_coded_1 := toxicity_coded] df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1] sample_model <- glm(outcome_formula, df_measerr_method, family=binomial(link='logit')) remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model")) @@ -35,20 +53,37 @@ compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, s print("running first example") -compare_dv_models(pred_formula = toxicity_pred ~ funny*white, - outcome_formula = toxicity_coded ~ funny*white, - proxy_formula = toxicity_pred ~ toxicity_coded*funny*white, +pred_formula = toxicity_pred ~ likes + race_disclosed +outcome_formula = toxicity_coded 
~ likes + race_disclosed +proxy_formula = toxicity_pred ~ toxicity_coded*race_disclosed*likes + +compare_dv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, df=df, sample.prop=0.01, - remember_prefix='cc_ex_tox.funny.white') + sample.size=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed') print("running second example") -compare_dv_models(pred_formula = toxicity_pred ~ likes+race_disclosed, - outcome_formula = toxicity_coded ~ likes + race_disclosed,KKJ - proxy_formula = toxicity_pred ~ toxicity_coded*likes*race_disclosed, +compare_dv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, df=df, - sample.prop=0.01, - remember_prefix='cc_ex_tox.funny.race_disclosed') + sample.size=10000, + sample.prop=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed.medsamp') + + +print("running third example") + +compare_dv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, + df=df, + sample.prop=0.05, + sample.size=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed.largesamp') diff --git a/civil_comments/02_iv_example.R b/civil_comments/02_iv_example.R new file mode 100644 index 0000000..ba5ab12 --- /dev/null +++ b/civil_comments/02_iv_example.R @@ -0,0 +1,107 @@ +source("../simulations/RemembR/R/RemembeR.R") +change.remember.file("iv_perspective_example.RDS") + +source('load_perspective_data.R') +source("../simulations/measerr_methods.R") + +remember(accuracies, "civil_comments_accuracies") +remember(f1s, "civil_comments_f1s") +remember(positive_cases, "civil_comments_positive_cases") +remember(proportions_cases, "civil_comments_proportions_cases") +remember(cortab, "civil_comments_cortab") +remember(nrow(df), 'n.annotated.comments') +# for reproducibility +set.seed(1) + +## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white? 
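+
+## The compare_iv_models() helper defined below fits four estimates of the same
+## relationship when the human-coded label (toxicity_coded) enters as a covariate:
+##   1. pred_model: substitutes the Perspective score (toxicity_pred) for the truth,
+##   2. coder_model: uses the human-coded labels on every row (the benchmark),
+##   3. sample_model: uses human-coded labels on a random subsample only,
+##   4. measerr_mle: combines the coded subsample with the prediction as a proxy and
+##      integrates over the unobserved labels (see ../simulations/measerr_methods.R).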
+ +compare_iv_models <-function(pred_formula, outcome_formula, proxy_formula, truth_formula, df, sample.prop, sample.size, remember_prefix){ + + if(is.null(sample.prop)){ + sample.prop <- sample.size / nrow(df) + } + if(is.null(sample.size)){ + sample.size <- nrow(df) * sample.prop + } + + remember(sample.size, paste0(remember_prefix, "sample.size")) + remember(sample.prop, paste0(remember_prefix, "sample.prop")) + remember(pred_formula, paste0(remember_prefix, "pred_formula")) + remember(outcome_formula, paste0(remember_prefix, 'outcome_formula')) + remember(proxy_formula, paste0(remember_prefix, 'proxy_formula')) + remember(truth_formula, paste0(remember_prefix, 'truth_formula')) + + pred_model <- glm(pred_formula, df, family=binomial(link='logit')) + remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model")) + remember(diag(vcov((pred_model))), paste0(remember_prefix, "se_pred_model")) + + coder_model <- glm(outcome_formula, df, family=binomial(link='logit')) + remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model")) + remember(diag(vcov((coder_model))), paste0(remember_prefix, "se_coder_model")) + + df_measerr_method <- copy(df)[sample(1:.N, sample.size), toxicity_coded_1 := toxicity_coded] + df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1] + sample_model <- glm(outcome_formula, df_measerr_method, family=binomial(link='logit')) + remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model")) + remember(diag(vcov((sample_model))), paste0(remember_prefix, "se_sample_model")) + + measerr_model <- measerr_mle(df_measerr_method, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula=proxy_formula, proxy_family=binomial(link='logit'),truth_formula=truth_formula, truth_family=binomial(link='logit')) + + inv_hessian = solve(measerr_model$hessian) + stderr = diag(inv_hessian) + remember(stderr, paste0(remember_prefix, "measerr_model_stderr")) + remember(measerr_model$par, paste0(remember_prefix, "measerr_model_par")) +} + +## print("running first iv example") + +## sample.prop <- 0.05 + +## compare_iv_models(white ~ toxicity_pred*funny, +## outcome_formula = white ~ toxicity_coded*funny, +## proxy_formula = toxicity_pred ~ toxicity_coded*funny*white, +## truth_formula = toxicity_coded ~ 1, +## df=df, +## sample.prop=sample.prop, +## remember_prefix='cc_ex_tox.funny.white') + + + +pred_formula <- race_disclosed ~ likes * toxicity_pred +outcome_formula <- race_disclosed ~ likes * toxicity_coded +proxy_formula <- toxicity_pred ~ toxicity_coded * race_disclosed * likes +truth_formula <- toxicity_coded ~ 1 + +print("running first example") + +compare_iv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, + truth_formula = truth_formula, + df=df, + sample.prop=0.01, + sample.size=NULL, + remember_prefix='cc_ex_tox.likes.race_disclosed') + +print("running second example") + +compare_iv_models(pred_formula = pred_formula, + outcome_formula = outcome_formula, + proxy_formula = proxy_formula, + truth_formula = truth_formula, + df=df, + sample.prop=NULL, + sample.size=10000, + remember_prefix='cc_ex_tox.likes.race_disclosed.medsamp') + +print("running third example") + +compare_iv_models(pred_formula = race_disclosed ~ likes * toxicity_pred, + outcome_formula = race_disclosed ~ likes * toxicity_coded, + proxy_formula = toxicity_pred ~ toxicity_coded + race_disclosed, + truth_formula = toxicity_coded ~ 1, + df=df, + sample.prop=0.05, + sample.size=NULL, + 
remember_prefix='cc_ex_tox.likes.race_disclosed.largesamp') + diff --git a/civil_comments/Makefile b/civil_comments/Makefile index 3c331c2..ab05a35 100644 --- a/civil_comments/Makefile +++ b/civil_comments/Makefile @@ -1,6 +1,14 @@ +qall: iv_perspective_example.RDS dv_perspective_example.RDS + srun_1core=srun -A comdata -p compute-bigmem --mem=4G --time=12:00:00 -c 1 --pty /usr/bin/bash -l +srun=srun -A comdata -p compute-bigmem --mem=4G --time=12:00:00 --pty /usr/bin/bash -l + perspective_scores.csv: perspective_json_to_csv.sh perspective_results.json $(srun_1core) ./$^ $@ +iv_perspective_example.RDS: 02_iv_example.R perspective_scores.csv + $(srun) Rscript $< +dv_perspective_example.RDS: 01_dv_example.R perspective_scores.csv + $(srun) Rscript $< diff --git a/civil_comments/design_example.R b/civil_comments/design_example.R index 1a83a81..4907688 100644 --- a/civil_comments/design_example.R +++ b/civil_comments/design_example.R @@ -5,95 +5,6 @@ source('load_perspective_data.R') ## the API claims that these scores are "probabilities" ## say we care about the model of the classification, not the probability -F1 <- function(y, predictions){ - tp <- sum( (predictions == y) & (predictions==1)) - fn <- sum( (predictions != y) & (predictions!=1)) - fp <- sum( (predictions != y) & (predictions==1)) - precision <- tp / (tp + fp) - recall <- tp / (tp + fn) - return (2 * precision * recall ) / (precision + recall) -} - - -## toxicity is about 93% accurate, with an f1 of 0.8 -## identity_attack has high accuracy 97%, but an unfortunant f1 of 0.5. -## threat has high accuracy 99%, but a really bad looking f1 of 0.48. -accuracies <- df[,.(identity_attack_acc = mean(identity_attack_pred == identity_attack_coded), - insult_pred_acc = mean(insult_pred == insult_coded), - profanity_acc = mean(profanity_pred == profanity_coded), - severe_toxicity_acc = mean(severe_toxicity_pred == severe_toxicity_coded), - theat_acc = mean(threat_pred == threat_coded), - toxicity_acc = mean(toxicity_pred == toxicity_coded))] - -f1s <- df[,.(identity_attack_f1 = F1(identity_attack_coded,identity_attack_pred), - insult_f1 = F1(insult_coded,insult_pred), - profanity_f1 = F1(profanity_coded,profanity_pred), - severe_toxicity_f1 = F1(severe_toxicity_coded,severe_toxicity_pred), - theat_f1 = F1(threat_coded,threat_pred), - toxicity_f1 = F1(toxicity_coded,toxicity_pred))] - -positive_cases <- df[,.(identity_attacks = sum(identity_attack_coded), - insults = sum(insult_coded), - profanities = sum(profanity_coded), - severe_toxic_comments = sum(severe_toxicity_coded), - threats = sum(threat_coded), - toxic_comments = sum(toxicity_coded))] - -## there are 50,000 toxic comments, 13000 identity attacks, 30000 insults, 3000 profanities, 8 severe toxic, and 1000 threats. - -proportions_cases <- df[,.(prop_identity = mean(identity_attack_coded), - prop_insults = mean(insult_coded), - prop_profanity = mean(profanity_coded), - prop_severe = mean(severe_toxicity_coded), - prop_threats = mean(threat_coded), - prop_toxic = mean(toxicity_coded))] - -## at 11% of comments, "toxicity" seems not so badly skewed. Try toxicity first, and if it doesn't work out try insults. - -## now look for an example where differential error affects an identity, or a reaction. 
-df <- df[,":="(identity_error = identity_attack_coded - identity_attack_pred, - insult_error = insult_coded - insult_pred, - profanity_error = profanity_coded - profanity_pred, - severe_toxic_error = severe_toxicity_coded - severe_toxicity_pred, - threat_error = threat_coded - threat_pred, - toxicity_error = toxicity_coded - toxicity_pred)] - -## what's correlated with toxicity_error ? -df <- df[,approved := rating == "approved"] -df <- df[,white := white > 0.5] - -cortab <- cor(df[,.(toxicity_error, - identity_error, - toxicity_coded, - funny, - approved, - sad, - wow, - likes, - disagree, - male, - female, - transgender, - other_gender, - heterosexual, - bisexual, - other_sexual_orientation, - christian, - jewish, - hindu, - buddhist, - atheist, - other_religion, - black, - white, - asian, - latino, - other_race_or_ethnicity, - physical_disability, - intellectual_or_learning_disability, - psychiatric_or_mental_illness, - other_disability)]) - ## toxicity error is weakly correlated pearson's R = 0.1 with both "white" and "black". ## compare regressions with "white" or "black" as the outcome and "toxicity_coded" or "toxicity_pred" as a predictor. @@ -107,22 +18,6 @@ cortab['toxicity_coded',] cortab['identity_error',] cortab['white',] -cortab <- cor(df[,.(toxicity_error, - identity_error, - toxicity_coded, - funny, - approved, - sad, - wow, - likes, - disagree, - gender_disclosed, - sexuality_disclosed, - religion_disclosed, - race_disclosed, - disability_disclosed)]) - - ## here's a simple example, is P(white | toxic and mentally ill) > P(white | toxic or mentally ill). Are people who discuss their mental illness in a toxic way more likely to be white compared to those who just talk about their mental illness or are toxic? summary(glm(white ~ toxicity_coded*psychiatric_or_mental_illness, data = df, family=binomial(link='logit'))) diff --git a/civil_comments/load_perspective_data.R b/civil_comments/load_perspective_data.R index 636c423..d6ef492 100644 --- a/civil_comments/load_perspective_data.R +++ b/civil_comments/load_perspective_data.R @@ -39,3 +39,98 @@ df <- df[,":="(gender_disclosed = dt.apply.any(gt.0.5, male, female, transgender disability_disclosed = dt.apply.any(gt.0.5,physical_disability, intellectual_or_learning_disability, psychiatric_or_mental_illness, other_disability))] df <- df[,white:=gt.0.5(white)] + + +F1 <- function(y, predictions){ + tp <- sum( (predictions == y) & (predictions==1)) + fn <- sum( (predictions != y) & (predictions!=1)) + fp <- sum( (predictions != y) & (predictions==1)) + precision <- tp / (tp + fp) + recall <- tp / (tp + fn) + return (2 * precision * recall ) / (precision + recall) +} + + +## toxicity is about 93% accurate, with an f1 of 0.8 +## identity_attack has high accuracy 97%, but an unfortunant f1 of 0.5. +## threat has high accuracy 99%, but a really bad looking f1 of 0.48. 
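+
+## F1() above is the usual harmonic mean of precision and recall,
+## F1 = 2 * precision * recall / (precision + recall), computed from 0/1 vectors
+## of human-coded labels (y) and classifier predictions. A toy call with made-up
+## labels (illustrative only, not data from this project):
+## F1(y = c(1, 1, 0, 0, 1), predictions = c(1, 0, 0, 1, 1))
+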
+accuracies <- df[,.(identity_attack_acc = mean(identity_attack_pred == identity_attack_coded), + insult_pred_acc = mean(insult_pred == insult_coded), + profanity_acc = mean(profanity_pred == profanity_coded), + severe_toxicity_acc = mean(severe_toxicity_pred == severe_toxicity_coded), + theat_acc = mean(threat_pred == threat_coded), + toxicity_acc = mean(toxicity_pred == toxicity_coded))] + +f1s <- df[,.(identity_attack_f1 = F1(identity_attack_coded,identity_attack_pred), + insult_f1 = F1(insult_coded,insult_pred), + profanity_f1 = F1(profanity_coded,profanity_pred), + severe_toxicity_f1 = F1(severe_toxicity_coded,severe_toxicity_pred), + theat_f1 = F1(threat_coded,threat_pred), + toxicity_f1 = F1(toxicity_coded,toxicity_pred))] + +positive_cases <- df[,.(identity_attacks = sum(identity_attack_coded), + insults = sum(insult_coded), + profanities = sum(profanity_coded), + severe_toxic_comments = sum(severe_toxicity_coded), + threats = sum(threat_coded), + toxic_comments = sum(toxicity_coded))] + +## there are 50,000 toxic comments, 13000 identity attacks, 30000 insults, 3000 profanities, 8 severe toxic, and 1000 threats. + +proportions_cases <- df[,.(prop_identity = mean(identity_attack_coded), + prop_insults = mean(insult_coded), + prop_profanity = mean(profanity_coded), + prop_severe = mean(severe_toxicity_coded), + prop_threats = mean(threat_coded), + prop_toxic = mean(toxicity_coded))] + +## at 11% of comments, "toxicity" seems not so badly skewed. Try toxicity first, and if it doesn't work out try insults. + +## now look for an example where differential error affects an identity, or a reaction. +df <- df[,":="(identity_error = identity_attack_coded - identity_attack_pred, + insult_error = insult_coded - insult_pred, + profanity_error = profanity_coded - profanity_pred, + severe_toxic_error = severe_toxicity_coded - severe_toxicity_pred, + threat_error = threat_coded - threat_pred, + toxicity_error = toxicity_coded - toxicity_pred)] + +## what's correlated with toxicity_error ? 
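+## The table below is the correlation matrix that the example scripts remember as
+## "civil_comments_cortab". Individual rows can be pulled out for inspection, e.g.
+## cortab['toxicity_error', c('white', 'black', 'race_disclosed')]
+## (column names taken from the variables listed below).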
+df <- df[,approved := rating == "approved"] +df <- df[,white := white > 0.5] + +cortab <- cor(df[,.(toxicity_error, + identity_error, + toxicity_coded, + funny, + approved, + sad, + wow, + likes, + disagree, + male, + female, + transgender, + other_gender, + heterosexual, + bisexual, + other_sexual_orientation, + christian, + jewish, + hindu, + buddhist, + atheist, + other_religion, + black, + white, + asian, + latino, + other_race_or_ethnicity, + physical_disability, + intellectual_or_learning_disability, + psychiatric_or_mental_illness, + other_disability, + gender_disclosed, + sexuality_disclosed, + religion_disclosed, + race_disclosed, + disability_disclosed)]) diff --git a/simulations/measerr_methods.R b/simulations/measerr_methods.R index fdc4978..92309ed 100644 --- a/simulations/measerr_methods.R +++ b/simulations/measerr_methods.R @@ -15,7 +15,12 @@ library(bbmle) ### ideal formulas for example 2 # test.fit.2 <- measerr_mle(df, y ~ x + z, gaussian(), w_pred ~ x + z + y + y:x, binomial(link='logit'), x ~ z) - +likelihood.logistic <- function(model.params, outcome, model.matrix){ + ll <- vector(mode='numeric', length=length(outcome)) + ll[outcome == 1] <- plogis(model.params %*% t(model.matrix[outcome==1,]), log=TRUE) + ll[outcome == 0] <- plogis(model.params %*% t(model.matrix[outcome==0,]), log=TRUE, lower.tail=FALSE) + return(ll) +} ## outcome_formula <- y ~ x + z; proxy_formula <- w_pred ~ y + x + z + x:z + x:y + z:y measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula, proxy_family=binomial(link='logit'),method='optim'){ @@ -126,6 +131,31 @@ measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_fo proxy.model.matrix <- model.matrix(proxy_formula, df) y.obs <- with(df.obs,eval(parse(text=response.var))) + df.proxy.obs <- model.frame(proxy_formula,df) + proxy.obs <- with(df.proxy.obs, eval(parse(text=proxy.variable))) + n.proxy.model.covars <- dim(proxy.model.matrix)[2] + + df.truth.obs <- model.frame(truth_formula, df) + truth.obs <- with(df.truth.obs, eval(parse(text=truth.variable))) + truth.model.matrix <- model.matrix(truth_formula,df.truth.obs) + n.truth.model.covars <- dim(truth.model.matrix)[2] + + df.unobs <- df[is.na(eval(parse(text=truth.variable)))] + df.unobs.x1 <- copy(df.unobs) + df.unobs.x1[,truth.variable] <- 1 + df.unobs.x0 <- copy(df.unobs) + df.unobs.x0[,truth.variable] <- 0 + outcome.unobs <- with(df.unobs, eval(parse(text=response.var))) + + outcome.model.matrix.x0 <- model.matrix(outcome_formula, df.unobs.x0) + outcome.model.matrix.x1 <- model.matrix(outcome_formula, df.unobs.x1) + + proxy.model.matrix.x0 <- model.matrix(proxy_formula, df.unobs.x0) + proxy.model.matrix.x1 <- model.matrix(proxy_formula, df.unobs.x1) + proxy.unobs <- df.unobs[[proxy.variable]] + + truth.model.matrix.unobs <- model.matrix(truth_formula, df.unobs.x0) + measerr_mle_nll <- function(params){ param.idx <- 1 n.outcome.model.covars <- dim(outcome.model.matrix)[2] @@ -138,82 +168,48 @@ measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_fo param.idx <- param.idx + 1 # outcome_formula likelihood using linear regression ll.y.obs <- dnorm(y.obs, outcome.params %*% t(outcome.model.matrix),sd=sigma.y, log=TRUE) - } - - df.obs <- model.frame(proxy_formula,df) - n.proxy.model.covars <- dim(proxy.model.matrix)[2] + } else if( (outcome_family$family == "binomial") & (outcome_family$link == "logit") ) + ll.y.obs <- likelihood.logistic(outcome.params, y.obs, outcome.model.matrix) + + proxy.params <- 
params[param.idx:(n.proxy.model.covars+param.idx-1)] param.idx <- param.idx + n.proxy.model.covars - proxy.obs <- with(df.obs, eval(parse(text=proxy.variable))) - if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){ - ll.w.obs <- vector(mode='numeric',length=dim(proxy.model.matrix)[1]) - - # proxy_formula likelihood using logistic regression - ll.w.obs[proxy.obs==1] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==1,]),log=TRUE) - ll.w.obs[proxy.obs==0] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==0,]),log=TRUE, lower.tail=FALSE) - } - - df.obs <- model.frame(truth_formula, df) + if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')) + ll.w.obs <- likelihood.logistic(proxy.params, proxy.obs, proxy.model.matrix) - truth.obs <- with(df.obs, eval(parse(text=truth.variable))) - truth.model.matrix <- model.matrix(truth_formula,df) - n.truth.model.covars <- dim(truth.model.matrix)[2] - truth.params <- params[param.idx:(n.truth.model.covars + param.idx - 1)] - if( (truth_family$family=="binomial") & (truth_family$link=='logit')){ - ll.x.obs <- vector(mode='numeric',length=dim(truth.model.matrix)[1]) - - # truth_formula likelihood using logistic regression - ll.x.obs[truth.obs==1] <- plogis(truth.params %*% t(truth.model.matrix[truth.obs==1,]),log=TRUE) - ll.x.obs[truth.obs==0] <- plogis(truth.params %*% t(truth.model.matrix[truth.obs==0,]),log=TRUE, lower.tail=FALSE) - } + if( (truth_family$family=="binomial") & (truth_family$link=='logit')) + ll.x.obs <- likelihood.logistic(truth.params, truth.obs, truth.model.matrix) - # add the three likelihoods + # add the three likelihoods ll.obs <- sum(ll.y.obs + ll.w.obs + ll.x.obs) ## likelihood for the predicted data ## integrate out the "truth" variable. if(truth_family$family=='binomial'){ - df.unobs <- df[is.na(eval(parse(text=truth.variable)))] - df.unobs.x1 <- copy(df.unobs) - df.unobs.x1[,'x'] <- 1 - df.unobs.x0 <- copy(df.unobs) - df.unobs.x0[,'x'] <- 0 - outcome.unobs <- with(df.unobs, eval(parse(text=response.var))) - - outcome.model.matrix.x0 <- model.matrix(outcome_formula, df.unobs.x0) - outcome.model.matrix.x1 <- model.matrix(outcome_formula, df.unobs.x1) if(outcome_family$family=="gaussian"){ - # likelihood of outcome ll.y.x0 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x0), sd=sigma.y, log=TRUE) ll.y.x1 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x1), sd=sigma.y, log=TRUE) + } else if( (outcome_family$family == "binomial") & (outcome_family$link == "logit") ){ + ll.y.x1 <- likelihood.logistic(outcome.params, outcome.unobs, outcome.model.matrix.x1) + ll.y.x0 <- likelihood.logistic(outcome.params, outcome.unobs, outcome.model.matrix.x0) } if( (proxy_family$family=='binomial') & (proxy_family$link=='logit')){ - proxy.model.matrix.x0 <- model.matrix(proxy_formula, df.unobs.x0) - proxy.model.matrix.x1 <- model.matrix(proxy_formula, df.unobs.x1) - proxy.unobs <- df.unobs[[proxy.variable]] - ll.w.x0 <- vector(mode='numeric', length=dim(df.unobs)[1]) - ll.w.x1 <- vector(mode='numeric', length=dim(df.unobs)[1]) - - # likelihood of proxy - ll.w.x0[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.x0[proxy.unobs==1,]), log=TRUE) - ll.w.x1[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.x1[proxy.unobs==1,]), log=TRUE) + ll.w.x0 <- likelihood.logistic(proxy.params, proxy.unobs, proxy.model.matrix.x0) + ll.w.x1 <- likelihood.logistic(proxy.params, proxy.unobs, proxy.model.matrix.x1) - ll.w.x0[proxy.unobs==0] <- 
plogis(proxy.params %*% t(proxy.model.matrix.x0[proxy.unobs==0,]), log=TRUE,lower.tail=FALSE) - ll.w.x1[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.x1[proxy.unobs==0,]), log=TRUE,lower.tail=FALSE) } if(truth_family$link=='logit'){ - truth.model.matrix <- model.matrix(truth_formula, df.unobs.x0) # likelihood of truth - ll.x.x1 <- plogis(truth.params %*% t(truth.model.matrix), log=TRUE) - ll.x.x0 <- plogis(truth.params %*% t(truth.model.matrix), log=TRUE, lower.tail=FALSE) + ll.x.x1 <- plogis(truth.params %*% t(truth.model.matrix.unobs), log=TRUE) + ll.x.x0 <- plogis(truth.params %*% t(truth.model.matrix.unobs), log=TRUE, lower.tail=FALSE) } } diff --git a/simulations/run_job.sbatch b/simulations/run_job.sbatch index 3ff4f80..8561788 100644 --- a/simulations/run_job.sbatch +++ b/simulations/run_job.sbatch @@ -2,7 +2,7 @@ #SBATCH --job-name="simulate measurement error models" ## Allocation Definition #SBATCH --account=comdata -#SBATCH --partition=compute-bigmem,compute-hugemem +#SBATCH --partition=compute-bigmem ## Resources #SBATCH --nodes=1 ## Walltime (4 hours) -- 2.39.5
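
As a reading aid for the measerr_methods.R changes above: the new likelihood.logistic() helper replaces the repeated plogis() blocks and returns per-observation Bernoulli log-likelihood contributions given a coefficient vector, a 0/1 outcome vector, and a model matrix. The sketch below copies the helper verbatim from the patch and exercises it on made-up toy inputs (not project data); the dbinom() line is only a cross-check of the same quantity.

likelihood.logistic <- function(model.params, outcome, model.matrix){
    ll <- vector(mode='numeric', length=length(outcome))
    ll[outcome == 1] <- plogis(model.params %*% t(model.matrix[outcome==1,]), log=TRUE)
    ll[outcome == 0] <- plogis(model.params %*% t(model.matrix[outcome==0,]), log=TRUE, lower.tail=FALSE)
    return(ll)
}

X <- cbind(1, c(-2, -1, 0, 1, 2, 3))  # toy design matrix: intercept plus one covariate
beta <- c(-0.5, 1.2)                  # toy coefficient vector
y <- c(0, 0, 0, 1, 1, 1)              # toy 0/1 outcomes, three observations per class

likelihood.logistic(beta, y, X)       # per-observation log-likelihood contributions

## the same total log-likelihood via dbinom(), as a cross-check
all.equal(sum(likelihood.logistic(beta, y, X)),
          sum(dbinom(y, 1, as.vector(plogis(X %*% beta)), log=TRUE)))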