]> code.communitydata.science - ml_measurement_error_public.git/blob - simulations/measerr_methods.R
fdc4978b72e32c7988634fb2265681c61a033b96
[ml_measurement_error_public.git] / simulations / measerr_methods.R
1 library(formula.tools)
2 library(matrixStats)
3 library(optimx)
4 library(bbmle)
5 ## df: dataframe to model
6 ## outcome_formula: formula for y | x, z
7 ## outcome_family: family for y | x, z
8 ## proxy_formula: formula for w | x, z, y 
9 ## proxy_family: family for w | x, z, y
10 ## truth_formula: formula for x | z
11 ## truth_family: family for x | z
12
13 ### ideal formulas for example 1
14 # test.fit.1 <- measerr_mle(df, y ~ x + z, gaussian(), w_pred ~ x, binomial(link='logit'), x ~ z)
15
16 ### ideal formulas for example 2
17 # test.fit.2 <- measerr_mle(df, y ~ x + z, gaussian(), w_pred ~ x + z + y + y:x, binomial(link='logit'), x ~ z)
18
19
20 ## outcome_formula <- y ~ x + z; proxy_formula <- w_pred ~ y + x + z + x:z + x:y + z:y 
21 measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula, proxy_family=binomial(link='logit'),method='optim'){
22     df.obs <- model.frame(outcome_formula, df)
23     proxy.model.matrix <- model.matrix(proxy_formula, df)
24     proxy.variable <- all.vars(proxy_formula)[1]
25
26     df.proxy.obs <- model.frame(proxy_formula,df)
27     proxy.obs <- with(df.proxy.obs, eval(parse(text=proxy.variable)))
28
29     response.var <- all.vars(outcome_formula)[1]
30     y.obs <- with(df.obs,eval(parse(text=response.var)))
31     outcome.model.matrix <- model.matrix(outcome_formula, df.obs)
32
33     df.unobs <- df[is.na(df[[response.var]])]
34     df.unobs.y1 <- copy(df.unobs)
35     df.unobs.y1[[response.var]] <- 1
36     df.unobs.y0 <- copy(df.unobs)
37     df.unobs.y0[[response.var]] <- 0
38
39     outcome.model.matrix.y1 <- model.matrix(outcome_formula, df.unobs.y1)
40     proxy.model.matrix.y1 <- model.matrix(proxy_formula, df.unobs.y1)
41     proxy.model.matrix.y0 <- model.matrix(proxy_formula, df.unobs.y0)
42     proxy.unobs <- with(df.unobs, eval(parse(text=proxy.variable)))
43
44     nll <- function(params){
45
46         param.idx <- 1
47         n.outcome.model.covars <- dim(outcome.model.matrix)[2]
48         outcome.params <- params[param.idx:n.outcome.model.covars]
49         param.idx <- param.idx + n.outcome.model.covars
50
51         if((outcome_family$family == "binomial") & (outcome_family$link == 'logit')){
52             ll.y.obs <- vector(mode='numeric', length=length(y.obs))
53             ll.y.obs[y.obs==1] <- plogis(outcome.params %*% t(outcome.model.matrix[y.obs==1,]),log=TRUE)
54             ll.y.obs[y.obs==0] <- plogis(outcome.params %*% t(outcome.model.matrix[y.obs==0,]),log=TRUE,lower.tail=FALSE)
55         }
56
57         n.proxy.model.covars <- dim(proxy.model.matrix)[2]
58         proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
59         param.idx <- param.idx + n.proxy.model.covars
60
61         if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
62             ll.w.obs <- vector(mode='numeric',length=dim(proxy.model.matrix)[1])
63             ll.w.obs[proxy.obs==1] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==1,]),log=TRUE)
64             ll.w.obs[proxy.obs==0] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==0,]),log=TRUE, lower.tail=FALSE)
65         }
66
67         ll.obs <- sum(ll.y.obs + ll.w.obs)
68         
69         ## integrate out y
70
71         if((outcome_family$family == "binomial") & (outcome_family$link == 'logit')){
72             ll.y.unobs.1 <- vector(mode='numeric', length=dim(outcome.model.matrix.y1)[1])
73             ll.y.unobs.0 <- vector(mode='numeric', length=dim(outcome.model.matrix.y1)[1])
74             ll.y.unobs.1 <- plogis(outcome.params %*% t(outcome.model.matrix.y1),log=TRUE)
75             ll.y.unobs.0 <- plogis(outcome.params %*% t(outcome.model.matrix.y1),log=TRUE,lower.tail=FALSE)
76         }
77
78         if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
79             ll.w.unobs.1 <- vector(mode='numeric',length=dim(proxy.model.matrix.y1)[1])
80             ll.w.unobs.0 <- vector(mode='numeric',length=dim(proxy.model.matrix.y0)[1])
81             ll.w.unobs.1[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.y1[proxy.unobs==1,]),log=TRUE)
82             ll.w.unobs.1[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.y1[proxy.unobs==0,]),log=TRUE, lower.tail=FALSE)
83
84             ll.w.unobs.0[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.y0[proxy.unobs==1,]),log=TRUE)
85             ll.w.unobs.0[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.y0[proxy.unobs==0,]),log=TRUE, lower.tail=FALSE)
86         }
87
88         ll.unobs.1 <- ll.y.unobs.1 + ll.w.unobs.1
89         ll.unobs.0 <- ll.y.unobs.0 + ll.w.unobs.0
90         ll.unobs <- sum(colLogSumExps(rbind(ll.unobs.1,ll.unobs.0)))
91         ll <- ll.unobs + ll.obs
92         return(-ll)
93     }
94     
95     params <- colnames(model.matrix(outcome_formula,df))
96     lower <- rep(-Inf, length(params))
97     proxy.params <- colnames(model.matrix(proxy_formula, df))
98     params <- c(params, paste0('proxy_',proxy.params))
99     lower <- c(lower, rep(-Inf, length(proxy.params)))
100     start <- rep(0.1,length(params))
101     names(start) <- params
102     
103     if(method=='optim'){
104         fit <- optim(start, fn = nll, lower=lower, method='L-BFGS-B', hessian=TRUE, control=list(maxit=1e6))
105     } else {
106         quoted.names <- gsub("[\\(\\)]",'',names(start))
107         print(quoted.names)
108         text <- paste("function(", paste0(quoted.names,'=',start,collapse=','),"){params<-c(",paste0(quoted.names,collapse=','),");return(nll(params))}")
109
110         measerr_mle_nll <- eval(parse(text=text))
111         names(start) <- quoted.names
112         names(lower) <- quoted.names
113         fit <- mle2(minuslogl=measerr_mle_nll, start=start, lower=lower, parnames=params,control=list(maxit=1e6),method='L-BFGS-B')
114     }
115     return(fit)
116 }
117
118
119 measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_formula, proxy_family=binomial(link='logit'), truth_formula, truth_family=binomial(link='logit'),method='optim'){
120
121     df.obs <- model.frame(outcome_formula, df)
122     response.var <- all.vars(outcome_formula)[1]
123     proxy.variable <- all.vars(proxy_formula)[1]
124     truth.variable <- all.vars(truth_formula)[1]
125     outcome.model.matrix <- model.matrix(outcome_formula, df)
126     proxy.model.matrix <- model.matrix(proxy_formula, df)
127     y.obs <- with(df.obs,eval(parse(text=response.var)))
128
129     measerr_mle_nll <- function(params){
130         param.idx <- 1
131         n.outcome.model.covars <- dim(outcome.model.matrix)[2]
132         outcome.params <- params[param.idx:n.outcome.model.covars]
133         param.idx <- param.idx + n.outcome.model.covars
134
135         ## likelihood for the fully observed data 
136         if(outcome_family$family == "gaussian"){
137             sigma.y <- params[param.idx]
138             param.idx <- param.idx + 1
139                                         #  outcome_formula likelihood using linear regression
140             ll.y.obs <- dnorm(y.obs, outcome.params %*% t(outcome.model.matrix),sd=sigma.y, log=TRUE)
141         }
142         
143         df.obs <- model.frame(proxy_formula,df)
144         n.proxy.model.covars <- dim(proxy.model.matrix)[2]
145         proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
146         param.idx <- param.idx + n.proxy.model.covars
147         proxy.obs <- with(df.obs, eval(parse(text=proxy.variable)))
148
149         if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
150             ll.w.obs <- vector(mode='numeric',length=dim(proxy.model.matrix)[1])
151
152                                         # proxy_formula likelihood using logistic regression
153             ll.w.obs[proxy.obs==1] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==1,]),log=TRUE)
154             ll.w.obs[proxy.obs==0] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==0,]),log=TRUE, lower.tail=FALSE)
155         }
156
157         df.obs <- model.frame(truth_formula, df)
158
159         truth.obs <- with(df.obs, eval(parse(text=truth.variable)))
160         truth.model.matrix <- model.matrix(truth_formula,df)
161         n.truth.model.covars <- dim(truth.model.matrix)[2]
162         
163         truth.params <- params[param.idx:(n.truth.model.covars + param.idx - 1)]
164
165         if( (truth_family$family=="binomial") & (truth_family$link=='logit')){
166             ll.x.obs <- vector(mode='numeric',length=dim(truth.model.matrix)[1])
167
168                                         # truth_formula likelihood using logistic regression
169             ll.x.obs[truth.obs==1] <- plogis(truth.params %*% t(truth.model.matrix[truth.obs==1,]),log=TRUE)
170             ll.x.obs[truth.obs==0] <- plogis(truth.params %*% t(truth.model.matrix[truth.obs==0,]),log=TRUE, lower.tail=FALSE)
171         }
172         
173                                         # add the three likelihoods
174         ll.obs <- sum(ll.y.obs + ll.w.obs + ll.x.obs)
175
176         ## likelihood for the predicted data
177         ## integrate out the "truth" variable. 
178         
179         if(truth_family$family=='binomial'){
180             df.unobs <- df[is.na(eval(parse(text=truth.variable)))]
181             df.unobs.x1 <- copy(df.unobs)
182             df.unobs.x1[,'x'] <- 1
183             df.unobs.x0 <- copy(df.unobs)
184             df.unobs.x0[,'x'] <- 0
185             outcome.unobs <- with(df.unobs, eval(parse(text=response.var)))
186             
187             outcome.model.matrix.x0 <- model.matrix(outcome_formula, df.unobs.x0)
188             outcome.model.matrix.x1 <- model.matrix(outcome_formula, df.unobs.x1)
189             if(outcome_family$family=="gaussian"){
190
191                                         # likelihood of outcome
192                 ll.y.x0 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x0), sd=sigma.y, log=TRUE)
193                 ll.y.x1 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x1), sd=sigma.y, log=TRUE)
194             }
195
196             if( (proxy_family$family=='binomial') & (proxy_family$link=='logit')){
197
198                 proxy.model.matrix.x0 <- model.matrix(proxy_formula, df.unobs.x0)
199                 proxy.model.matrix.x1 <- model.matrix(proxy_formula, df.unobs.x1)
200                 proxy.unobs <- df.unobs[[proxy.variable]]
201                 ll.w.x0 <- vector(mode='numeric', length=dim(df.unobs)[1])
202                 ll.w.x1 <- vector(mode='numeric', length=dim(df.unobs)[1])
203
204                                         # likelihood of proxy
205                 ll.w.x0[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.x0[proxy.unobs==1,]), log=TRUE)
206                 ll.w.x1[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.x1[proxy.unobs==1,]), log=TRUE)
207
208                 ll.w.x0[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.x0[proxy.unobs==0,]), log=TRUE,lower.tail=FALSE)
209                 ll.w.x1[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.x1[proxy.unobs==0,]), log=TRUE,lower.tail=FALSE)
210             }
211
212             if(truth_family$link=='logit'){
213                 truth.model.matrix <- model.matrix(truth_formula, df.unobs.x0)
214                                         # likelihood of truth
215                 ll.x.x1 <- plogis(truth.params %*% t(truth.model.matrix), log=TRUE)
216                 ll.x.x0 <- plogis(truth.params %*% t(truth.model.matrix), log=TRUE, lower.tail=FALSE)
217             }
218         }
219
220         ll.x0 <- ll.y.x0 + ll.w.x0 + ll.x.x0
221         ll.x1 <- ll.y.x1 + ll.w.x1 + ll.x.x1
222         ll.unobs <- sum(colLogSumExps(rbind(ll.x0, ll.x1)))
223         return(-(ll.unobs + ll.obs))
224     }
225     
226     outcome.params <- colnames(model.matrix(outcome_formula,df))
227     lower <- rep(-Inf, length(outcome.params))
228
229     if(outcome_family$family=='gaussian'){
230         params <- c(outcome.params, 'sigma_y')
231         lower <- c(lower, 0.00001)
232     } else {
233         params <- outcome.params
234     }
235     
236     proxy.params <- colnames(model.matrix(proxy_formula, df))
237     params <- c(params, paste0('proxy_',proxy.params))
238     lower <- c(lower, rep(-Inf, length(proxy.params)))
239
240     truth.params <- colnames(model.matrix(truth_formula, df))
241     params <- c(params, paste0('truth_', truth.params))
242     lower <- c(lower, rep(-Inf, length(truth.params)))
243     start <- rep(0.1,length(params))
244     names(start) <- params
245
246     if(method=='optim'){
247         fit <- optim(start, fn = measerr_mle_nll, lower=lower, method='L-BFGS-B', hessian=TRUE, control=list(maxit=1e6))
248     } else { # method='mle2'
249                 
250         quoted.names <- gsub("[\\(\\)]",'',names(start))
251
252         text <- paste("function(", paste0(quoted.names,'=',start,collapse=','),"){params<-c(",paste0(quoted.names,collapse=','),");return(measerr_mle_nll(params))}")
253
254         measerr_mle_nll_mle <- eval(parse(text=text))
255         names(start) <- quoted.names
256         names(lower) <- quoted.names
257         fit <- mle2(minuslogl=measerr_mle_nll_mle, start=start, lower=lower, parnames=params,control=list(maxit=1e6),method='L-BFGS-B')
258     }
259
260     return(fit)
261 }
262
263 ## Experimental, but probably works. 
264 measerr_irr_mle <- function(df, outcome_formula, outcome_family=gaussian(), coder_formulas=list(x.obs.0~x, x.obs.1~x), proxy_formula, proxy_family=binomial(link='logit'), truth_formula, truth_family=binomial(link='logit'),method='optim'){
265
266     ### in this scenario, the ground truth also has measurement error, but we have repeated measures for it. 
267     # this time we never get to observe the true X
268     outcome.model.matrix <- model.matrix(outcome_formula, df)
269     proxy.model.matrix <- model.matrix(proxy_formula, df)
270     response.var <- all.vars(outcome_formula)[1]
271     proxy.var <- all.vars(proxy_formula)[1]
272     param.var <- all.vars(truth_formula)[1]
273     truth.var<- all.vars(truth_formula)[1]
274     y <- with(df,eval(parse(text=response.var)))
275
276     nll <- function(params){
277         param.idx <- 1
278         n.outcome.model.covars <- dim(outcome.model.matrix)[2]
279         outcome.params <- params[param.idx:n.outcome.model.covars]
280         param.idx <- param.idx + n.outcome.model.covars
281
282
283         if(outcome_family$family == "gaussian"){
284             sigma.y <- params[param.idx]
285             param.idx <- param.idx + 1
286         }
287
288
289         n.proxy.model.covars <- dim(proxy.model.matrix)[2]
290         proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
291         param.idx <- param.idx + n.proxy.model.covars
292         
293         df.temp <- copy(df)
294
295         if((truth_family$family == "binomial")
296            & (truth_family$link=='logit')){
297             integrate.grid <- expand.grid(replicate(1 + length(coder_formulas), c(0,1), simplify=FALSE))
298             ll.parts <- matrix(nrow=nrow(df),ncol=nrow(integrate.grid))
299             for(i in 1:nrow(integrate.grid)){
300                 # setup the dataframe for this row
301                 row <- integrate.grid[i,]
302
303                 df.temp[[param.var]] <- row[[1]]
304                 ci <- 2
305                 for(coder_formula in coder_formulas){
306                     coder.var <- all.vars(coder_formula)[1]
307                     df.temp[[coder.var]] <- row[[ci]]
308                     ci <- ci + 1 
309                 }
310                 
311                 outcome.model.matrix <- model.matrix(outcome_formula, df.temp)                
312                 if(outcome_family$family == "gaussian"){
313                     ll.y <- dnorm(y, outcome.params %*% t(outcome.model.matrix), sd=sigma.y, log=TRUE)
314                 }
315
316                 if(proxy_family$family=="binomial" & (proxy_family$link=='logit')){
317                     proxy.model.matrix <- model.matrix(proxy_formula, df.temp)
318                     ll.w <- vector(mode='numeric', length=dim(proxy.model.matrix)[1])
319                     proxyvar <- with(df.temp,eval(parse(text=proxy.var)))
320                     ll.w[proxyvar==1] <- plogis(proxy.params %*% t(proxy.model.matrix[proxyvar==1,]),log=TRUE)
321                     ll.w[proxyvar==0] <- plogis(proxy.params %*% t(proxy.model.matrix[proxyvar==0,]),log=TRUE,lower.tail=FALSE)
322                 }
323
324                 ## probability of the coded variables
325                 coder.lls <- matrix(nrow=nrow(df.temp),ncol=length(coder_formulas))
326                 ci <- 1
327                 for(coder_formula in coder_formulas){
328                     coder.model.matrix <- model.matrix(coder_formula, df.temp)
329                     n.coder.model.covars <- dim(coder.model.matrix)[2]
330                     coder.params <- params[param.idx:(n.coder.model.covars + param.idx - 1)]
331                     param.idx <- param.idx + n.coder.model.covars
332                     coder.var <- all.vars(coder_formula)[1]
333                     x.obs <- with(df.temp, eval(parse(text=coder.var)))
334                     true.codervar <- df[[all.vars(coder_formula)[1]]]
335
336                     ll.coder <- vector(mode='numeric', length=dim(coder.model.matrix)[1])
337                     ll.coder[x.obs==1] <- plogis(coder.params %*% t(coder.model.matrix[x.obs==1,]),log=TRUE)
338                     ll.coder[x.obs==0] <- plogis(coder.params %*% t(coder.model.matrix[x.obs==0,]),log=TRUE,lower.tail=FALSE)
339
340                     # don't count when we know the observed value, unless we're accounting for observed value
341                     ll.coder[(!is.na(true.codervar)) & (true.codervar != x.obs)] <- NA
342                     coder.lls[,ci] <- ll.coder
343                     ci <- ci + 1
344                 }
345                 
346                 truth.model.matrix <- model.matrix(truth_formula, df.temp)
347                 n.truth.model.covars <- dim(truth.model.matrix)[2]
348                 truth.params <- params[param.idx:(n.truth.model.covars + param.idx - 1)]
349
350                 for(coder_formula in coder_formulas){
351                     coder.model.matrix <- model.matrix(coder_formula, df.temp)
352                     n.coder.model.covars <- dim(coder.model.matrix)[2]
353                     param.idx <- param.idx - n.coder.model.covars
354                 }
355
356                 x <- with(df.temp, eval(parse(text=truth.var)))
357                 ll.truth <- vector(mode='numeric', length=dim(truth.model.matrix)[1])
358                 ll.truth[x==1] <- plogis(truth.params %*% t(truth.model.matrix[x==1,]), log=TRUE)
359                 ll.truth[x==0] <- plogis(truth.params %*% t(truth.model.matrix[x==0,]), log=TRUE, lower.tail=FALSE)
360
361                 true.truthvar <- df[[all.vars(truth_formula)[1]]]
362                 
363                 if(!is.null(true.truthvar)){
364                                         # ll.truth[(!is.na(true.truthvar)) & (true.truthvar != truthvar)] <- -Inf
365                     # ll.truth[(!is.na(true.truthvar)) & (true.truthvar == truthvar)] <- 0
366                 }
367                 ll.parts[,i] <- ll.y + ll.w + apply(coder.lls,1,sum) + ll.truth
368                 
369             }
370
371             lls <- rowLogSumExps(ll.parts,na.rm=TRUE)
372
373         ## likelihood of observed data 
374             target <- -1 * sum(lls)
375             return(target)
376         }
377     }
378         
379     outcome.params <- colnames(model.matrix(outcome_formula,df))
380     lower <- rep(-Inf, length(outcome.params))
381
382     if(outcome_family$family=='gaussian'){
383         params <- c(outcome.params, 'sigma_y')
384         lower <- c(lower, 0.00001)
385     } else {
386         params <- outcome.params
387     }
388
389     proxy.params <- colnames(model.matrix(proxy_formula, df))
390     params <- c(params, paste0('proxy_',proxy.params))
391     positive.params <- paste0('proxy_',truth.var)
392     lower <- c(lower, rep(-Inf, length(proxy.params)))
393     names(lower) <- params
394     lower[positive.params] <- 0.01
395     ci <- 0
396     
397     for(coder_formula in coder_formulas){
398         coder.params <- colnames(model.matrix(coder_formula,df))
399         params <- c(params, paste0('coder_',ci,coder.params))
400         positive.params <- paste0('coder_', ci, truth.var)
401         ci <- ci + 1
402         lower <- c(lower, rep(-Inf, length(coder.params)))
403         names(lower) <- params
404         lower[positive.params] <- 0.01
405     }
406
407     truth.params <- colnames(model.matrix(truth_formula, df))
408     params <- c(params, paste0('truth_', truth.params))
409     lower <- c(lower, rep(-Inf, length(truth.params)))
410     start <- rep(0.1,length(params))
411     names(start) <- params
412     names(lower) <- params
413     
414     if(method=='optim'){
415         print(start)
416         fit <- optim(start, fn = nll, lower=lower, method='L-BFGS-B', hessian=TRUE, control=list(maxit=1e6))
417     } else {
418                 
419         quoted.names <- gsub("[\\(\\)]",'',names(start))
420         print(quoted.names)
421         text <- paste("function(", paste0(quoted.names,'=',start,collapse=','),"){params<-c(",paste0(quoted.names,collapse=','),");return(nll(params))}")
422
423         measerr_mle_nll <- eval(parse(text=text))
424         names(start) <- quoted.names
425         names(lower) <- quoted.names
426         fit <- mle2(minuslogl=measerr_mle_nll, start=start, lower=lower, method='L-BFGS-B',control=list(maxit=1e6))
427     }
428
429     return(fit)
430 }
431
432 ## Experimental, and does not work.
433 measerr_irr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), coder_formulas=list(y.obs.0~y+w_pred+y.obs.1,y.obs.1~y+w_pred+y.obs.0), proxy_formula=w_pred~y, proxy_family=binomial(link='logit'),method='optim'){
434     integrate.grid <- expand.grid(replicate(1 + length(coder_formulas), c(0,1), simplify=FALSE))
435 #    print(integrate.grid)
436
437
438     outcome.model.matrix <- model.matrix(outcome_formula, df)
439     n.outcome.model.covars <- dim(outcome.model.matrix)[2]
440
441
442     ### in this scenario, the ground truth also has measurement error, but we have repeated measures for it. 
443     # this time we never get to observe the true X
444     nll <- function(params){
445         param.idx <- 1
446         outcome.params <- params[param.idx:n.outcome.model.covars]
447         param.idx <- param.idx + n.outcome.model.covars
448         proxy.model.matrix <- model.matrix(proxy_formula, df)
449         n.proxy.model.covars <- dim(proxy.model.matrix)[2]
450         response.var <- all.vars(outcome_formula)[1]
451
452         if(outcome_family$family == "gaussian"){
453             sigma.y <- params[param.idx]
454             param.idx <- param.idx + 1
455         }
456
457         proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
458         param.idx <- param.idx + n.proxy.model.covars
459
460         df.temp <- copy(df)
461
462         if((outcome_family$family == "binomial")
463            & (outcome_family$link=='logit')){
464             ll.parts <- matrix(nrow=nrow(df),ncol=nrow(integrate.grid))
465             for(i in 1:nrow(integrate.grid)){
466                 # setup the dataframe for this row
467                 row <- integrate.grid[i,]
468
469                 df.temp[[response.var]] <- row[[1]]
470                 ci <- 2
471                 for(coder_formula in coder_formulas){
472                     codervar <- all.vars(coder_formula)[1]
473                     df.temp[[codervar]] <- row[[ci]]
474                     ci <- ci + 1 
475                 }
476                 
477                 outcome.model.matrix <- model.matrix(outcome_formula, df.temp)                
478                 if(outcome_family$family == "gaussian"){
479                     ll.y <- dnorm(df.temp[[response.var]], outcome.params %*% t(outcome.model.matrix), sd=sigma.y, log=T)
480                 }
481
482                 if(outcome_family$family == "binomial" & (outcome_family$link=='logit')){
483                     ll.y <- vector(mode='numeric',length=nrow(df.temp))
484                     ll.y[df.temp[[response.var]]==1] <- plogis( outcome.params %*% t(outcome.model.matrix), log=TRUE)
485                     ll.y[df.temp[[response.var]]==0] <- plogis( outcome.params %*% t(outcome.model.matrix), log=TRUE,lower.tail=FALSE)
486                 }
487
488                 if(proxy_family$family=="binomial" & (proxy_family$link=='logit')){
489                     proxy.model.matrix <- model.matrix(proxy_formula, df.temp)
490                     ll.w <- vector(mode='numeric', length=dim(proxy.model.matrix)[1])
491                     proxyvar <- with(df.temp,eval(parse(text=all.vars(proxy_formula)[1])))
492                     ll.w[proxyvar==1] <- plogis(proxy.params %*% t(proxy.model.matrix[proxyvar==1,]),log=TRUE)
493                     ll.w[proxyvar==0] <- plogis(proxy.params %*% t(proxy.model.matrix[proxyvar==0,]),log=TRUE,lower.tail=FALSE)
494                 }
495
496                 ## probability of the coded variables
497                 coder.lls <- matrix(nrow=nrow(df.temp),ncol=length(coder_formulas))
498                 ci <- 1
499                 for(coder_formula in coder_formulas){
500                     coder.model.matrix <- model.matrix(coder_formula, df.temp)
501                     n.coder.model.covars <- dim(coder.model.matrix)[2]
502                     coder.params <- params[param.idx:(n.coder.model.covars + param.idx - 1)]
503                     param.idx <- param.idx + n.coder.model.covars
504                     codervar <- with(df.temp, eval(parse(text=all.vars(coder_formula)[1])))
505                     true.codervar <- df[[all.vars(coder_formula)[1]]]
506
507                     ll.coder <- vector(mode='numeric', length=dim(coder.model.matrix)[1])
508                     ll.coder[codervar==1] <- plogis(coder.params %*% t(coder.model.matrix[codervar==1,]),log=TRUE)
509                     ll.coder[codervar==0] <- plogis(coder.params %*% t(coder.model.matrix[codervar==0,]),log=TRUE,lower.tail=FALSE)
510
511                     # don't count when we know the observed value, unless we're accounting for observed value
512                     ll.coder[(!is.na(true.codervar)) & (true.codervar != codervar)] <- NA
513                     coder.lls[,ci] <- ll.coder
514                     ci <- ci + 1
515                 }
516
517                 for(coder_formula in coder_formulas){
518                     coder.model.matrix <- model.matrix(coder_formula, df.temp)
519                     n.coder.model.covars <- dim(coder.model.matrix)[2]
520                     param.idx <- param.idx - n.coder.model.covars
521                 }
522
523                 ll.parts[,i] <- ll.y + ll.w + apply(coder.lls,1,function(x) sum(x)) 
524                 
525             }
526
527             lls <- rowLogSumExps(ll.parts,na.rm=TRUE)
528
529             ## likelihood of observed data 
530             target <- -1 * sum(lls)
531 #            print(target)
532 #            print(params)
533             return(target)
534         }
535     }
536         
537     outcome.params <- colnames(model.matrix(outcome_formula,df))
538     response.var <- all.vars(outcome_formula)[1]
539     lower <- rep(-Inf, length(outcome.params))
540
541     if(outcome_family$family=='gaussian'){
542         params <- c(outcome.params, 'sigma_y')
543         lower <- c(lower, 0.00001)
544     } else {
545         params <- outcome.params
546     }
547
548     ## constrain the model of the coder and proxy vars
549     ## this is to ensure identifiability
550     ## it is a safe assumption because the coders aren't hostile (wrong more often than right)
551     ## so we can assume that y ~Bw, B is positive
552     proxy.params <- colnames(model.matrix(proxy_formula, df))
553     positive.params <- paste0('proxy_',response.var)
554     params <- c(params, paste0('proxy_',proxy.params))
555     lower <- c(lower, rep(-Inf, length(proxy.params)))
556     names(lower) <- params
557     lower[positive.params] <- 0.001
558
559     ci <- 0
560     for(coder_formula in coder_formulas){
561         coder.params <- colnames(model.matrix(coder_formula,df))
562         latent.coder.params <- coder.params %in% response.var
563         params <- c(params, paste0('coder_',ci,coder.params))
564         positive.params <- paste0('coder_',ci,response.var)
565         ci <- ci + 1
566         lower <- c(lower, rep(-Inf, length(coder.params)))
567         names(lower) <-params
568         lower[positive.params] <- 0.001
569     }
570
571     ## init by using the "loco model"
572     temp.df <- copy(df)
573     temp.df <- temp.df[y.obs.1 == y.obs.0, y:=y.obs.1]
574     loco.model <- glm(outcome_formula, temp.df, family=outcome_family)
575     
576     start <- rep(1,length(params))
577     names(start) <- params
578     start[names(coef(loco.model))] <- coef(loco.model)
579     names(lower) <- params
580     if(method=='optim'){
581         print(lower)
582         fit <- optim(start, fn = nll, lower=lower, method='L-BFGS-B', hessian=TRUE,control=list(maxit=1e6))
583     } else {
584                 
585         quoted.names <- gsub("[\\(\\)]",'',names(start))
586         print(quoted.names)
587         text <- paste("function(", paste0(quoted.names,'=',start,collapse=','),"){params<-c(",paste0(quoted.names,collapse=','),");return(nll(params))}")
588
589         measerr_mle_nll <- eval(parse(text=text))
590         names(start) <- quoted.names
591         names(lower) <- quoted.names
592         fit <- mle2(minuslogl=measerr_mle_nll, start=start, lower=lower, parnames=params,control=list(maxit=1e6),method='L-BFGS-B')
593     }
594
595     return(fit)
596 }
597

Community Data Science Collective || Want to submit a patch?