simulations/simulation_base.R

   1 library(predictionError)
   2 library(mecor)
   3 options(amelia.parallel="no",
   4         amelia.ncpus=1)
   5 library(Amelia)
   6 library(Zelig)
   7 library(stats4)
   8
   9
  10 ## This uses the pseudolikelihood approach from Carroll page 349.
  11 ## assumes MAR
  12 ## assumes differential error, but that only depends on Y
  13 ## inefficient, because pseudolikelihood
  14 logistic.correction.pseudo <- function(df){
  15     p1.est <- mean(df[w_pred==1]$y.obs==1,na.rm=T)
  16     p0.est <- mean(df[w_pred==0]$y.obs==0,na.rm=T)
  17
  18     nll <- function(B0, Bxy, Bgy){
  19         probs <- (1 - p0.est) + (p1.est + p0.est - 1)*plogis(B0 + Bxy * df$x + Bgy * df$g)
  20
  21         part1 = sum(log(probs[df$w_pred == 1]))
  22         part2 = sum(log(1-probs[df$w_pred == 0]))
  23
  24         return(-1*(part1 + part2))
  25     }
  26
  27     mlefit <- stats4::mle(minuslogl = nll, start = list(B0=0, Bxy=0.0, Bgy=0.0))
  28     return(mlefit)
  29
  30 }
  31
  32 ## This uses the likelihood approach from Carroll page 353.
  33 ## assumes that we have a good measurement error model
  34 logistic.correction.liklihood <- function(df){
  35
  36     ## liklihood for observed responses
  37     nll <- function(B0, Bxy, Bgy, gamma0, gamma_y, gamma_g){
  38         df.obs <- df[!is.na(y.obs)]
  39         p.y.obs <- plogis(B0 + Bxy * df.obs$x + Bgy*df.obs$g)
  40         p.y.obs[df.obs$y==0] <- 1-p.y.obs[df.obs$y==0]
  41         p.s.obs <- plogis(gamma0 + gamma_y * df.obs$y + gamma_g*df.obs$g)
  42         p.s.obs[df.obs$w_pred==0] <- 1 - p.s.obs[df.obs$w_pred==0]
  43
  44         p.obs <- p.s.obs * p.y.obs
  45
  46         df.unobs <- df[is.na(y.obs)]
  47
  48         p.unobs.1 <- plogis(B0 + Bxy * df.unobs$x + Bgy*df.unobs$g)*plogis(gamma0 + gamma_y + gamma_g*df.unobs$g)
  49         p.unobs.0 <- (1-plogis(B0 + Bxy * df.unobs$x + Bgy*df.unobs$g))*plogis(gamma0 + gamma_g*df.unobs$g)
  50         p.unobs <- p.unobs.1 + p.unobs.0
  51         p.unobs[df.unobs$w_pred==0] <- 1 - p.unobs[df.unobs$w_pred==0]
  52
  53         p <- c(p.obs, p.unobs)
  54
  55         return(-1*(sum(log(p))))
  56     }
  57
  58     mlefit <- stats4::mle(minuslogl = nll, start = list(B0=1, Bxy=0,Bgy=0, gamma0=5, gamma_y=0, gamma_g=0))
  59
  60     return(mlefit)
  61 }
  62
  63
  64 logistic <- function(x) {1/(1+exp(-1*x))}
  65
  66 run_simulation_depvar <- function(df, result){
  67
  68     accuracy <- df[,mean(w_pred==y)]
  69     result <- append(result, list(accuracy=accuracy))
  70
  71     (model.true <- glm(y ~ x + g, data=df,family=binomial(link='logit')))
  72     true.ci.Bxy <- confint(model.true)['x',]
  73     true.ci.Bgy <- confint(model.true)['g',]
  74
  75     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
  76                                   Bgy.est.true=coef(model.true)['g'],
  77                                   Bxy.ci.upper.true = true.ci.Bxy[2],
  78                                   Bxy.ci.lower.true = true.ci.Bxy[1],
  79                                   Bgy.ci.upper.true = true.ci.Bgy[2],
  80                                   Bgy.ci.lower.true = true.ci.Bgy[1]))
  81
  82     (model.feasible <- glm(y.obs~x+g,data=df,family=binomial(link='logit')))
  83
  84     feasible.ci.Bxy <- confint(model.feasible)['x',]
  85     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x'],
  86                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
  87                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
  88
  89     feasible.ci.Bgy <- confint(model.feasible)['g',]
  90     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
  91                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
  92                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
  93
  94     (model.naive <- glm(w_pred~x+g, data=df, family=binomial(link='logit')))
  95
  96     naive.ci.Bxy <- confint(model.naive)['x',]
  97     naive.ci.Bgy <- confint(model.naive)['g',]
  98
  99     result <- append(result, list(Bxy.est.naive=coef(model.naive)['x'],
 100                                   Bgy.est.naive=coef(model.naive)['g'],
 101                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
 102                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
 103                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
 104                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))
 105
 106
 107     (model.naive.cont <- lm(w~x+g, data=df))
 108     naivecont.ci.Bxy <- confint(model.naive.cont)['x',]
 109     naivecont.ci.Bgy <- confint(model.naive.cont)['g',]
 110
 111     ## my implementatoin of liklihood based correction
 112     mod.caroll.lik <- logistic.correction.liklihood(df)
 113     coef <- coef(mod.caroll.lik)
 114     ci <- confint(mod.caroll.lik)
 115
 116     result <- append(result,
 117                      list(Bxy.est.mle = coef['Bxy'],
 118                           Bxy.ci.upper.mle = ci['Bxy','97.5 %'],
 119                           Bxy.ci.lower.mle = ci['Bxy','2.5 %'],
 120                           Bgy.est.mle = coef['Bgy'],
 121                           Bgy.ci.upper.mle = ci['Bgy','97.5 %'],
 122                           Bgy.ci.lower.mle = ci['Bgy','2.5 %']))
 123
 124
 125     ## my implementatoin of liklihood based correction
 126     mod.caroll.pseudo <- logistic.correction.pseudo(df)
 127     coef <- coef(mod.caroll.pseudo)
 128     ci <- confint(mod.caroll.pseudo)
 129
 130     result <- append(result,
 131                      list(Bxy.est.pseudo = coef['Bxy'],
 132                           Bxy.ci.upper.pseudo = ci['Bxy','97.5 %'],
 133                           Bxy.ci.lower.pseudo = ci['Bxy','2.5 %'],
 134                           Bgy.est.pseudo = coef['Bgy'],
 135                           Bgy.ci.upper.pseudo = ci['Bgy','97.5 %'],
 136                           Bgy.ci.lower.pseudo = ci['Bgy','2.5 %']))
 137
 138
 139     # amelia says use normal distribution for binary variables.
 140     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w_pred'))
 141     mod.amelia.k <- zelig(y.obs~x+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
 142     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
 143
 144     est.x.mi <- coefse['x','Estimate']
 145     est.x.se <- coefse['x','Std.Error']
 146     result <- append(result,
 147                      list(Bxy.est.amelia.full = est.x.mi,
 148                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
 149                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
 150                           ))
 151
 152     est.g.mi <- coefse['g','Estimate']
 153     est.g.se <- coefse['g','Std.Error']
 154
 155     result <- append(result,
 156                      list(Bgy.est.amelia.full = est.g.mi,
 157                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
 158                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
 159                           ))
 160
 161     return(result)
 162
 163 }
 164
 165 run_simulation <-  function(df, result){
 166
 167     accuracy <- df[,mean(w_pred==x)]
 168     result <- append(result, list(accuracy=accuracy))
 169
 170     (model.true <- lm(y ~ x + g, data=df))
 171     true.ci.Bxy <- confint(model.true)['x',]
 172     true.ci.Bgy <- confint(model.true)['g',]
 173
 174     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
 175                                   Bgy.est.true=coef(model.true)['g'],
 176                                   Bxy.ci.upper.true = true.ci.Bxy[2],
 177                                   Bxy.ci.lower.true = true.ci.Bxy[1],
 178                                   Bgy.ci.upper.true = true.ci.Bgy[2],
 179                                   Bgy.ci.lower.true = true.ci.Bgy[1]))
 180
 181     (model.feasible <- lm(y~x.obs+g,data=df))
 182
 183     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
 184     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
 185                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
 186                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
 187
 188     feasible.ci.Bgy <- confint(model.feasible)['g',]
 189     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
 190                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
 191                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
 192
 193     (model.naive <- lm(y~w+g, data=df))
 194
 195     naive.ci.Bxy <- confint(model.naive)['w',]
 196     naive.ci.Bgy <- confint(model.naive)['g',]
 197
 198     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
 199                                   Bgy.est.naive=coef(model.naive)['g'],
 200                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
 201                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
 202                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
 203                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))
 204
 205
 206     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'))
 207     mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
 208     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
 209
 210     est.x.mi <- coefse['x.obs','Estimate']
 211     est.x.se <- coefse['x.obs','Std.Error']
 212     result <- append(result,
 213                      list(Bxy.est.amelia.full = est.x.mi,
 214                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
 215                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
 216                           ))
 217
 218     est.g.mi <- coefse['g','Estimate']
 219     est.g.se <- coefse['g','Std.Error']
 220
 221     result <- append(result,
 222                      list(Bgy.est.amelia.full = est.g.mi,
 223                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
 224                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
 225                           ))
 226
 227     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
 228     ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
 229     ## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
 230     ## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
 231
 232     ## est.x.mi <- coefse['x.obs','Estimate']
 233     ## est.x.se <- coefse['x.obs','Std.Error']
 234     ## result <- append(result,
 235     ##                  list(Bxy.est.amelia.nok = est.x.mi,
 236     ##                       Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
 237     ##                       Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
 238     ##                       ))
 239
 240     ## est.g.mi <- coefse['g','Estimate']
 241     ## est.g.se <- coefse['g','Std.Error']
 242
 243     ## result <- append(result,
 244     ##                  list(Bgy.est.amelia.nok = est.g.mi,
 245     ##                       Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
 246     ##                       Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
 247     ##                       ))
 248
 249     N <- nrow(df)
 250     m <- nrow(df[!is.na(x.obs)])
 251     p <- v <- train <- rep(0,N)
 252     M <- m
 253     p[(M+1):(N)] <- 1
 254     v[1:(M)] <- 1
 255     df <- df[order(x.obs)]
 256     y <- df[,y]
 257     x <- df[,x.obs]
 258     g <- df[,g]
 259     w <- df[,w]
 260     # gmm gets pretty close
 261     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))
 262
 263     result <- append(result,
 264                      list(Bxy.est.gmm = gmm.res$beta[1,1],
 265                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
 266                           Bxy.ci.lower.gmm = gmm.res$confint[1,1],
 267                           gmm.ER_pval = gmm.res$ER_pval
 268                           ))
 269
 270     result <- append(result,
 271                      list(Bgy.est.gmm = gmm.res$beta[2,1],
 272                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
 273                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
 274
 275
 276     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
 277     (mod.calibrated.mle)
 278     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
 279     result <- append(result, list(
 280                                  Bxy.est.mecor = mecor.ci['Estimate'],
 281                                  Bxy.upper.mecor = mecor.ci['UCI'],
 282                                  Bxy.lower.mecor = mecor.ci['LCI'])
 283                      )
 284
 285     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
 286
 287     result <- append(result, list(
 288                                  Bgy.est.mecor = mecor.ci['Estimate'],
 289                                  Bgy.upper.mecor = mecor.ci['UCI'],
 290                                  Bgy.lower.mecor = mecor.ci['LCI'])
 291                      )
 292
 293 ##    clean up memory
 294 ##    rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))
 295
 296 ##    gc()
 297     return(result)
 298 }