]> code.communitydata.science - articlequality_ordinal.git/blob - analyze_quality_models.R
add the rest of the code.
[articlequality_ordinal.git] / analyze_quality_models.R
1 library(MASS)
2 library(brms)
3 options(mc.cores=28)
4 library(ggplot2)
5 library(data.table)
6 library(arrow)
7 library(wCorr)
8
9 source("RemembR/R/RemembeR.R")
10
11 change.remember.file("ordinal.quality.analysis.RDS")
12
13 #model.1 <- readRDS("models/ordinal_quality_intercept.RDS")
14 model.main.pca <- readRDS("models/ordinal_quality_pca.RDS")
15 model.main.pca.cumulative <- readRDS("models/ordinal_quality_pca.cumulative.RDS")
16 model.qe6 <- readRDS("models/ordinal_quality_qe6.RDS")
17 df <- readRDS("data/training_quality_labels.RDS")
18
19 # then compare them with loo
20 comparison.loo <- loo_compare(model.main.pca,model.qe6,model.main.pca.cumulative)
21 #comparison.waic <- loo_compare(model.main.noC,model.main.noB,model.main.noFa,model.main.noGa,model.main.noStart,model.main.noStub,criterion='waic')
22 print(comparison.loo,simplify=F,digits=2)
23 remember(comparison.loo,"comparison.loo")
24
25 # LOO Chooses NoC
26 best.model <- model.main.pca.cumulative
27
28 pca_features <- readRDS("data/ores_pca_features.RDS")
29 pca_features_unweighted <- readRDS("data/ores_pca_features.noweights.RDS")
30
31 test.df <- readRDS("data/holdout_quality_labels.RDS")
32
33 wpca_transform <- function(wpca, x){
34     x <- as.matrix(x)
35     centered <- as.matrix(t(t(x) - wpca$means))
36     return(centered %*% wpca$basis)
37 }
38
39 new_pca_features <- wpca_transform(pca_features, test.df[,.(Stub, Start, C, B, GA, FA)])
40
41 test.df<-test.df[,":="(pca1 = new_pca_features[,1],
42                        pca2 = new_pca_features[,2],
43                        pca3 = new_pca_features[,3],
44                        pca4 = new_pca_features[,4],
45                        pca5 = new_pca_features[,5])]
46
47 unweighted.pca <- wpca_transform(pca_features_unweighted, test.df[,.(Stub, Start, C, B, GA, FA)])
48
49 test.df <- test.df[,":="(pca1.noweights = unweighted.pca[,1],
50                          pca2.noweights = unweighted.pca[,2],
51                          pca3.noweights = unweighted.pca[,3],
52                          pca4.noweights = unweighted.pca[,4],
53                          pca5.noweights = unweighted.pca[,5],
54                          pca6.noweights = unweighted.pca[,6])]
55
56 draws <- as.data.table(best.model)
57
58 test.df <- test.df[,idx.max:=.(apply(test.df[,.(Stub,Start,C,B,GA,FA)],1,which.max))]
59 test.df <- test.df[,MPQC:=.(apply(test.df[,.(idx.max)],1,function(idx) c("stub","start","c","b","ga","fa")[idx]))]
60 top.preds <- test.df[,MPQC]
61
62 #ordinal.fitted.1 <- fitted(best.model, test.df, scale='response')
63 ordinal.fitted <- data.table(fitted(best.model, test.df, scale='linear'))
64 ordinal.pred <- ordinal.fitted$Estimate
65 remember(ordinal.fitted,'ordinal.fitted')
66 ordinal.quality <- ordinal.pred
67 quality.even6 <- apply(test.df[,.(Stub,Start,B,C,GA,FA)],1,function(r) r %*% c(0,1,2,3,4,5))
68 quality.even5 <- apply(test.df[,.(Stub,Start,B,GA,FA)],1,function(r) r %*% c(1,2,3,4,5))
69
70 test.df <- test.df[,quality.ordinal := ordinal.quality]
71 test.df <- test.df[,quality.even6 := quality.even6]
72
73 (spearcor <- weightedCorr(test.df$quality.ordinal, test.df$quality.even6, method='spearman', weights=test.df$article_weight))
74 remember(spearcor, 'spearman.corr')
75 (pearsoncor <- weightedCorr(test.df$quality.ordinal, test.df$quality.even6, method='pearson', weights=test.df$article_weight))
76 remember(pearsoncor, 'pearson.corr')
77
78 ordinal.preds <- data.table(predict(best.model, test.df, robust=F))
79 #names(ordinal.preds) <- c("Stub","Start","C","B","A","GA","FA")
80 names(ordinal.preds) <- c("Stub","Start","C","B","GA","FA")
81 ordinal.preds <- ordinal.preds[,idx.max:=.(apply(ordinal.preds[,.(Stub,Start,C,B,GA,FA)],1,which.max))]
82 #ordinal.preds <- ordinal.preds[,predicted:=.(apply(ordinal.preds[,.(idx.max)],1,function(idx) c("stub","start","c","b",'a',"ga","fa")[idx]))]
83 ordinal.preds <- ordinal.preds[,predicted:=.(apply(ordinal.preds[,.(idx.max)],1,function(idx) c("stub","start","c","b","ga","fa")[idx]))]
84 pred.qe6 <- data.table(predict(model.qe6,test.df))
85 names(pred.qe6) <- c("Stub","Start","C","B","GA","FA")
86 pred.qe6 <- pred.qe6[,idx.max:=.(apply(pred.qe6[,.(Stub,Start,C,B,GA,FA)],1,which.max))]
87 #pred.qe6 <- pred.qe6[,predicted:=.(apply(pred.qe6[,.(idx.max)],1,function(idx) c("stub","start","c","b",'a',"ga","fa")[idx]))]
88 pred.qe6 <- pred.qe6[,predicted:=.(apply(pred.qe6[,.(idx.max)],1,function(idx) c("stub","start","c","b","ga","fa")[idx]))]
89
90 test.df <- test.df[,ordinal.pred := ordinal.preds$predicted]
91 test.df <- test.df[,pred.qe6 := pred.qe6$predicted]
92 test.df <- test.df[,idx.max:=.(apply(test.df[,.(Stub,Start,C,B,GA,FA)],1,which.max))]
93 test.df <- test.df[,MPQC:=.(apply(test.df[,.(idx.max)],1,function(idx) c("stub","start","c","b","ga","fa")[idx]))]
94
95 (top.pred.accuracy <- weighted.mean(test.df[,.(MPQC)] == test.df[,.(wp10)],test.df$article_weight))
96 remember(top.pred.accuracy, "top.pred.accuracy")
97 (ordinal.pred.accuracy <- weighted.mean(test.df[,.(ordinal.pred)] == test.df[,.(wp10)], test.df$article_weight))
98 remember(ordinal.pred.accuracy, "ordinal.pred.accuracy")
99 quality.even6 <- apply(df[,.(Stub,Start,B,C,GA,FA)],1,function(r) r %*% c(1,2,3,4,5,6))
100 (pred.qe6.accuracy <- weighted.mean(test.df[,.(pred.qe6)] == test.df[,.(wp10)], test.df$article_weight))
101 remember(ordinal.pred.accuracy, "ordinal.pred.accuracy")
102 remember(best.model, "best.model")
103 remember(test.df,'test.df')
104
105 ordinal.preds[,wp10:=test.df$wp10]
106 ordinal.preds[,weight:=test.df$article_weight]
107 total.weight <- sum(ordinal.preds$weight)
108 library(modi)
109 print("Calibration stats 1")
110 calibration.stats.1 <- ordinal.preds[,.(prob.predicted=apply(.SD,2,function(c) weighted.mean(c,weight)),
111                                       var.predicted=apply(.SD,2,function(c) weighted.var(c,weight))),.SDcols=c("Stub","Start","C","B","GA","FA")]
112
113 calibration.stats.1[,wp10:=c("stub","start","c","b","ga","fa")]
114 calip.data = ordinal.preds[order(wp10),.(prob.data=sum(weight)/total.weight,
115                                          var.data=var(weight)/total.weight),by=.(wp10)]
116
117 calibration.stats.1 <- calibration.stats.1[calip.data,on=.(wp10)]
118
119 calibration.stats.1$weighttype <- 'Article weight'
120
121 ordinal.preds[,weight:=test.df$revision_weight]
122 total.weight <- sum(ordinal.preds$weight)
123 print("Calibration stats 2")
124 calibration.stats.2 <- ordinal.preds[,.(prob.predicted=apply(.SD,2,function(c) weighted.mean(c,weight)),
125                                         var.predicted=apply(.SD,2,function(c) weighted.var(c,weight))),.SDcols=c("Stub","Start","C","B","GA","FA")]
126
127
128 calibration.stats.2[,wp10:=c("stub","start","c","b","ga","fa")]
129 calip.data = ordinal.preds[order(wp10),.(prob.data=sum(weight)/total.weight,
130                                          var.data=var(weight)/total.weight),by=.(wp10)]
131
132 calibration.stats.2 <- calibration.stats.2[calip.data,on=.(wp10)]
133
134 calibration.stats.2$weighttype <- 'Revision weight'
135
136
137 ordinal.preds[,weight:=rep(1,nrow(ordinal.preds))]
138 total.weight <- sum(ordinal.preds$weight)
139 print("Calibration stats 3")
140 calibration.stats.3 <- ordinal.preds[,.(prob.predicted=apply(.SD,2,function(c) weighted.mean(c,weight)),
141                                         var.predicted=apply(.SD,2,function(c) weighted.var(c,weight))),.SDcols=c("Stub","Start","C","B","GA","FA")]
142
143
144 calibration.stats.3[,wp10:=c("stub","start","c","b","ga","fa")]
145 calip.data = ordinal.preds[order(wp10),.(prob.data=sum(weight)/total.weight,
146                                          var.data=var(weight)/total.weight),by=.(wp10)]
147
148 calibration.stats.3 <- calibration.stats.3[calip.data,on=.(wp10)]
149
150 calibration.stats.3$weighttype <- 'No weight'
151
152 calibration.stats <- rbind(calibration.stats.1,calibration.stats.2,calibration.stats.3)
153
154 calibration.stats[,calibration:=prob.data - prob.predicted]
155
156 remember(calibration.stats, "calibration.stats")
157
158
159 ## p <- ggplot(data.frame(quality.ordinal, quality.even6, quality.even5))
160 ## p <- p + geom_point(aes(x=quality.even6,y=quality.ordinal)) + geom_smooth(aes(x=quality.even6,y=quality.ordinal))
161
162 ## print(p)
163 ## dev.off()
164
165 ## post.pred <- posterior_predict(model.main)
166 ## preds <- as.character(predict(polrmodel))
167 ## polrmodel.accuracy <- weighted.mean(preds==df$wp10,df$weight)

Community Data Science Collective || Want to submit a patch?