]> code.communitydata.science - articlequality_ordinal.git/blob - ordinal_quality_models.R
add the rest of the code.
[articlequality_ordinal.git] / ordinal_quality_models.R
1 source("RemembR/R/RemembeR.R")
2 source("load_data.R")
3 change.remember.file("ordinal.quality.model.RDS")
4
5 test <- F
6
7 remember(weights, "sample.weights")
8
9 n.holdout <- 4000
10 remember(n.holdout,"n.holdout")
11 holdout <- df[sample(.N,n.holdout)]
12 saveRDS(holdout,'data/holdout_quality_labels.RDS')
13 df <- df[!(revid %in% holdout$revid)]
14 saveRDS(df,'data/training_quality_labels.RDS')
15
16 if(test == TRUE){
17     df <- df[sample(.N,2000)]
18 }
19
20 ## So it turns out that the 6 predictors we have are highly correlated creating problems for sampling so use QR decomposition
21 df <- df[!is.na(wp10)]
22
23 df[, start.p.stub := Start + Stub]
24
25 saveRDS(df,"data/training_quality_labels.RDS")
26
27 ## So it turns out that the 6 predictors we have are highly correlated creating problems for sampling so use QR decomposition
28 df <- df[!is.na(wp10)]
29
30 df[, start.p.stub := Start + Stub]
31
32 fam.cloglog <- sratio(link='cloglog', threshold='flexible')
33 #formula.1 <- brmsformula(wp10 | weights(weight) ~  1,decomp='QR',center=TRUE)
34
35 fam <- sratio(link='logit', threshold='flexible')
36 fam.cumulative <- sratio(link='logit', threshold='flexible')
37
38 ## It turns out that the matrix is singular if we include all the predictors.
39 ## C is the most correlated with the other variables so for now let's remove it.
40
41 ## it turns out that we don't need to do model selection at all since we don't care about the coefficients.
42 ## we can just do the csv!
43 x <- df[,.(Stub,Start,C,B,GA,FA)]
44
45 wpca <- function(x, weight){
46     name <- names(x)
47     x <- as.matrix(x)
48     means <- unlist(lapply(1:dim(x)[2], function(i) weighted.mean(x[,i], weight)))
49     names(means) <- name
50     centered <- as.matrix(t(t(x) - means))
51     weightmat <- diag(weight)
52     covmat <- t(centered) %*% weightmat %*% centered / (sum(weight) - 1)
53
54     factors <- eigen(covmat)
55     basis <- factors$vectors
56     result <- centered %*% basis
57     # return a list with the info we need to do the transformation
58     return(list(means=means, basis=basis, x=result))
59 }
60
61 #unweighted.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],rep(1,nrow(df)))
62 upca <- prcomp(df[,.(Stub,Start,C,B,GA,FA)])
63 unweighted.pca <- list(means = upca$center, basis=upca$rotation, x=upca$x)
64 saveRDS(unweighted.pca,"data/ores_pca_features.noweights.RDS")
65
66 weighted.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],df$article_weight)
67 saveRDS(weighted.pca, "data/ores_pca_features.RDS")
68
69 revision.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],df$revision_weight)
70 saveRDS(revision.pca, "data/ores_pca_features_revisions.RDS")
71
72 df <- df[,":="(pca1 = weighted.pca$x[,1],
73                pca2 = weighted.pca$x[,2],
74                pca3 = weighted.pca$x[,3],
75                pca4 = weighted.pca$x[,4],
76                pca5 = weighted.pca$x[,5],
77                pca6 = weighted.pca$x[,6])]
78
79 df <- df[,":="(pca1.revision = revision.pca$x[,1],
80                pca2.revision = revision.pca$x[,2],
81                pca3.revision = revision.pca$x[,3],
82                pca4.revision = revision.pca$x[,4],
83                pca5.revision = revision.pca$x[,5],
84                pca6.revision = revision.pca$x[,6])]
85
86 df <- df[,":="(pca1.noweights = unweighted.pca$x[,1],
87                pca2.noweights = unweighted.pca$x[,2],
88                pca3.noweights = unweighted.pca$x[,3],
89                pca4.noweights = unweighted.pca$x[,4],
90                pca5.noweights = unweighted.pca$x[,5],
91                pca6.noweights = unweighted.pca$x[,6])]
92
93 qformula.main.pca.cs <- brmsformula(wp10 | weights(article_weight) ~  cs(pca1) + cs(pca2) + cs(pca3) + cs(pca4) + cs(pca5))
94 formula.main.pca.noweights.cs <- brmsformula(wp10  ~  cs(pca1.noweights) + cs(pca2.noweights) + cs(pca3.noweights) + cs(pca4.noweights) + cs(pca5.noweights))
95 formula.revision.pca.cs <- brmsformula(wp10 | weights(revision_weight) ~  cs(pca1.revision) + cs(pca2.revision) + cs(pca3.revision) + cs(pca4.revision) + cs(pca5.revision))
96 formula.qe6.cs <- brmsformula(wp10 | weights(article_weight) ~ cs(quality.even6))
97 formula.qe6.revision.cs <- brmsformula(wp10 | weights(revision_weight) ~ cs(quality.even6))
98 formula.qe6.noweights.cs <- brmsformula(wp10 ~ cs(quality.even6))
99
100 formula.main.pca <- brmsformula(wp10 | weights(article_weight) ~  pca1 + pca2 + pca3 + pca4 + pca5)
101 formula.main.pca.noweights <- brmsformula(wp10  ~  pca1.noweights + pca2.noweights + pca3.noweights + pca4.noweights + pca5.noweights)
102 formula.revision.pca <- brmsformula(wp10 | weights(revision_weight) ~  pca1.revision + pca2.revision + pca3.revision + pca4.revision + pca5.revision)
103 formula.qe6 <- brmsformula(wp10 | weights(article_weight) ~ quality.even6)
104 formula.qe6.revision <- brmsformula(wp10 | weights(revision_weight) ~ quality.even6)
105 formula.qe6.noweights <- brmsformula(wp10 ~ quality.even6)
106
107 formula.scores.noweights <- brmsformula(wp10 ~ Start + Stub + GA + FA + B)
108                                         
109
110 library(future)
111 library(parallel)
112 options(mc.cores = parallel::detectCores())
113
114 plan(lapply(1:7,function(x) tweak(multisession, workers=4)))
115
116 model.main.pca %<-% brm(formula=formula.main.pca, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
117 model.qe6 %<-% brm(formula.qe6, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
118
119 model.main.revision %<-% brm(formula=formula.revision.pca, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
120
121 model.qe6.revision %<-% brm(formula.qe6.revision, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
122 model.qe6.noweights %<-% brm(formula.qe6.noweights, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
123 model.main.pca.noweights %<-% brm(formula=formula.main.pca.noweights, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
124
125 ## model.main.pca.cs %<-% brm(formula=formula.main.pca.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
126 ## model.qe6.cs %<-% brm(formula.qe6.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
127
128 ## model.main.revision.cs %<-% brm(formula=formula.revision.pca.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
129
130 ## model.qe6.revision.cs %<-% brm(formula.qe6.revision.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
131 ## model.qe6.noweights.cs %<-% brm(formula.qe6.noweights.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
132 ## model.main.pca.noweights.cs %<-% brm(formula=formula.main.pca.noweights.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
133
134 model.main.pca.cumulative %<-% brm(formula=formula.main.pca, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
135 model.qe6.cumulative %<-% brm(formula.qe6, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
136
137 model.main.revision.cumulative %<-% brm(formula=formula.revision.pca, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
138
139 model.qe6.revision.cumulative %<-% brm(formula.qe6.revision, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
140 model.qe6.noweights.cumulative %<-% brm(formula.qe6.noweights, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
141 model.main.pca.noweights.cumulative %<-% brm(formula=formula.main.pca.noweights, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
142
143
144 #model.scores.noweights <- brm(formula=formula.scores.noweights, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
145
146 models <- resolve(globalenv(),result=F) 
147 print("adding loo")
148
149 model.main.revision <- add_criterion(model.main.revision,'loo',moment_match=T)
150 model.main.pca <- add_criterion(model.main.pca,'loo',moment_match=T)
151 model.qe6.revision <- add_criterion(model.qe6.revision,'loo')
152 model.qe6 <- add_criterion(model.qe6,'loo')
153 model.main.pca.noweights <- add_criterion(model.main.pca.noweights,'loo',moment_match=T)
154 model.qe6.noweights <- add_criterion(model.qe6.noweights,'loo')
155
156 model.main.revision.cumulative <- add_criterion(model.main.revision.cumulative,'loo',moment_match=T)
157 model.main.pca.cumulative <- add_criterion(model.main.pca.cumulative,'loo',moment_match=T)
158 model.qe6.revision.cumulative <- add_criterion(model.qe6.revision.cumulative,'loo')
159 model.qe6.cumulative <- add_criterion(model.qe6.cumulative,'loo')
160 model.main.pca.noweights.cumulative <- add_criterion(model.main.pca.noweights.cumulative,'loo',moment_match=T)
161 model.qe6.noweights.cumulative <- add_criterion(model.qe6.noweights.cumulative,'loo')
162
163
164 ## model.main.revision.cs <- add_criterion(model.main.revision.cs,'loo',moment_match=T)
165 ## model.main.pca.cs <- add_criterion(model.main.pca.cs,'loo',moment_match=T)
166 ## model.qe6.revision.cs <- add_criterion(model.qe6.revision.cs,'loo')
167 ## model.qe6.cs <- add_criterion(model.qe6.cs,'loo')
168 ## model.main.pca.noweights.cs <- add_criterion(model.main.pca.noweights.cs,'loo',moment_match=T)
169 ## model.qe6.noweights.cs <- add_criterion(model.qe6.noweights.cs,'loo')
170
171 saveRDS(model.qe6.revision,"models/ordinal_quality_qe6_revision.RDS")
172 saveRDS(model.qe6,"models/ordinal_quality_qe6.RDS")
173 saveRDS(model.main.pca.noweights,"models/ordinal_quality_pca.noweights.RDS")
174 saveRDS(model.qe6.noweights,"models/ordinal_quality_qe6.noweights.RDS")
175 saveRDS(model.main.pca,"models/ordinal_quality_pca.RDS")
176 saveRDS(model.main.revision,"models/ordinal_quality_pca_revision.RDS")
177
178 saveRDS(model.qe6.revision.cumulative,"models/ordinal_quality_qe6_revision.cumulative.RDS")
179 saveRDS(model.qe6.cumulative,"models/ordinal_quality_qe6.cumulative.RDS")
180 saveRDS(model.main.pca.noweights.cumulative,"models/ordinal_quality_pca.noweights.cumulative.RDS")
181 saveRDS(model.qe6.noweights.cumulative,"models/ordinal_quality_qe6.noweights.cumulative.RDS")
182 saveRDS(model.main.pca.cumulative,"models/ordinal_quality_pca.cumulative.RDS")
183 saveRDS(model.main.revision.cumulative,"models/ordinal_quality_pca_revision.cumulative.RDS")
184
185 ## saveRDS(model.qe6.revision.cs,"models/ordinal_quality_qe6_revision.RDS")
186 ## saveRDS(model.qe6.cs,"models/ordinal_quality_qe6.RDS")
187 ## saveRDS(model.main.pca.noweights.cs,"models/ordinal_quality_pca.noweights.RDS")
188 ## saveRDS(model.qe6.noweights.cs,"models/ordinal_quality_qe6.noweights.RDS")
189 ## saveRDS(model.main.pca.cs,"models/ordinal_quality_pca.RDS")
190 ## saveRDS(model.main.revision.cs,"models/ordinal_quality_pca_revision.RDS")
191
192
193
194 models <- resolve(globalenv(),result=F) 
195
196
197
198
199
200
201
202

Community Data Science Collective || Want to submit a patch?