]> code.communitydata.science - articlequality_ordinal.git/blobdiff - ordinal_quality_models.R
add the rest of the code.
[articlequality_ordinal.git] / ordinal_quality_models.R
diff --git a/ordinal_quality_models.R b/ordinal_quality_models.R
new file mode 100644 (file)
index 0000000..99163bc
--- /dev/null
@@ -0,0 +1,202 @@
+source("RemembR/R/RemembeR.R")
+source("load_data.R")
+change.remember.file("ordinal.quality.model.RDS")
+
+test <- F
+
+remember(weights, "sample.weights")
+
+n.holdout <- 4000
+remember(n.holdout,"n.holdout")
+holdout <- df[sample(.N,n.holdout)]
+saveRDS(holdout,'data/holdout_quality_labels.RDS')
+df <- df[!(revid %in% holdout$revid)]
+saveRDS(df,'data/training_quality_labels.RDS')
+
+if(test == TRUE){
+    df <- df[sample(.N,2000)]
+}
+
+## So it turns out that the 6 predictors we have are highly correlated creating problems for sampling so use QR decomposition
+df <- df[!is.na(wp10)]
+
+df[, start.p.stub := Start + Stub]
+
+saveRDS(df,"data/training_quality_labels.RDS")
+
+## So it turns out that the 6 predictors we have are highly correlated creating problems for sampling so use QR decomposition
+df <- df[!is.na(wp10)]
+
+df[, start.p.stub := Start + Stub]
+
+fam.cloglog <- sratio(link='cloglog', threshold='flexible')
+#formula.1 <- brmsformula(wp10 | weights(weight) ~  1,decomp='QR',center=TRUE)
+
+fam <- sratio(link='logit', threshold='flexible')
+fam.cumulative <- sratio(link='logit', threshold='flexible')
+
+## It turns out that the matrix is singular if we include all the predictors.
+## C is the most correlated with the other variables so for now let's remove it.
+
+## it turns out that we don't need to do model selection at all since we don't care about the coefficients.
+## we can just do the csv!
+x <- df[,.(Stub,Start,C,B,GA,FA)]
+
+wpca <- function(x, weight){
+    name <- names(x)
+    x <- as.matrix(x)
+    means <- unlist(lapply(1:dim(x)[2], function(i) weighted.mean(x[,i], weight)))
+    names(means) <- name
+    centered <- as.matrix(t(t(x) - means))
+    weightmat <- diag(weight)
+    covmat <- t(centered) %*% weightmat %*% centered / (sum(weight) - 1)
+
+    factors <- eigen(covmat)
+    basis <- factors$vectors
+    result <- centered %*% basis
+    # return a list with the info we need to do the transformation
+    return(list(means=means, basis=basis, x=result))
+}
+
+#unweighted.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],rep(1,nrow(df)))
+upca <- prcomp(df[,.(Stub,Start,C,B,GA,FA)])
+unweighted.pca <- list(means = upca$center, basis=upca$rotation, x=upca$x)
+saveRDS(unweighted.pca,"data/ores_pca_features.noweights.RDS")
+
+weighted.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],df$article_weight)
+saveRDS(weighted.pca, "data/ores_pca_features.RDS")
+
+revision.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],df$revision_weight)
+saveRDS(revision.pca, "data/ores_pca_features_revisions.RDS")
+
+df <- df[,":="(pca1 = weighted.pca$x[,1],
+               pca2 = weighted.pca$x[,2],
+               pca3 = weighted.pca$x[,3],
+               pca4 = weighted.pca$x[,4],
+               pca5 = weighted.pca$x[,5],
+               pca6 = weighted.pca$x[,6])]
+
+df <- df[,":="(pca1.revision = revision.pca$x[,1],
+               pca2.revision = revision.pca$x[,2],
+               pca3.revision = revision.pca$x[,3],
+               pca4.revision = revision.pca$x[,4],
+               pca5.revision = revision.pca$x[,5],
+               pca6.revision = revision.pca$x[,6])]
+
+df <- df[,":="(pca1.noweights = unweighted.pca$x[,1],
+               pca2.noweights = unweighted.pca$x[,2],
+               pca3.noweights = unweighted.pca$x[,3],
+               pca4.noweights = unweighted.pca$x[,4],
+               pca5.noweights = unweighted.pca$x[,5],
+               pca6.noweights = unweighted.pca$x[,6])]
+
+qformula.main.pca.cs <- brmsformula(wp10 | weights(article_weight) ~  cs(pca1) + cs(pca2) + cs(pca3) + cs(pca4) + cs(pca5))
+formula.main.pca.noweights.cs <- brmsformula(wp10  ~  cs(pca1.noweights) + cs(pca2.noweights) + cs(pca3.noweights) + cs(pca4.noweights) + cs(pca5.noweights))
+formula.revision.pca.cs <- brmsformula(wp10 | weights(revision_weight) ~  cs(pca1.revision) + cs(pca2.revision) + cs(pca3.revision) + cs(pca4.revision) + cs(pca5.revision))
+formula.qe6.cs <- brmsformula(wp10 | weights(article_weight) ~ cs(quality.even6))
+formula.qe6.revision.cs <- brmsformula(wp10 | weights(revision_weight) ~ cs(quality.even6))
+formula.qe6.noweights.cs <- brmsformula(wp10 ~ cs(quality.even6))
+
+formula.main.pca <- brmsformula(wp10 | weights(article_weight) ~  pca1 + pca2 + pca3 + pca4 + pca5)
+formula.main.pca.noweights <- brmsformula(wp10  ~  pca1.noweights + pca2.noweights + pca3.noweights + pca4.noweights + pca5.noweights)
+formula.revision.pca <- brmsformula(wp10 | weights(revision_weight) ~  pca1.revision + pca2.revision + pca3.revision + pca4.revision + pca5.revision)
+formula.qe6 <- brmsformula(wp10 | weights(article_weight) ~ quality.even6)
+formula.qe6.revision <- brmsformula(wp10 | weights(revision_weight) ~ quality.even6)
+formula.qe6.noweights <- brmsformula(wp10 ~ quality.even6)
+
+formula.scores.noweights <- brmsformula(wp10 ~ Start + Stub + GA + FA + B)
+                                        
+
+library(future)
+library(parallel)
+options(mc.cores = parallel::detectCores())
+
+plan(lapply(1:7,function(x) tweak(multisession, workers=4)))
+
+model.main.pca %<-% brm(formula=formula.main.pca, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
+model.qe6 %<-% brm(formula.qe6, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+
+model.main.revision %<-% brm(formula=formula.revision.pca, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
+
+model.qe6.revision %<-% brm(formula.qe6.revision, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+model.qe6.noweights %<-% brm(formula.qe6.noweights, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+model.main.pca.noweights %<-% brm(formula=formula.main.pca.noweights, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
+
+## model.main.pca.cs %<-% brm(formula=formula.main.pca.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
+## model.qe6.cs %<-% brm(formula.qe6.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+
+## model.main.revision.cs %<-% brm(formula=formula.revision.pca.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
+
+## model.qe6.revision.cs %<-% brm(formula.qe6.revision.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+## model.qe6.noweights.cs %<-% brm(formula.qe6.noweights.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+## model.main.pca.noweights.cs %<-% brm(formula=formula.main.pca.noweights.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
+
+model.main.pca.cumulative %<-% brm(formula=formula.main.pca, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
+model.qe6.cumulative %<-% brm(formula.qe6, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+
+model.main.revision.cumulative %<-% brm(formula=formula.revision.pca, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE))
+
+model.qe6.revision.cumulative %<-% brm(formula.qe6.revision, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+model.qe6.noweights.cumulative %<-% brm(formula.qe6.noweights, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE))
+model.main.pca.noweights.cumulative %<-% brm(formula=formula.main.pca.noweights, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
+
+
+#model.scores.noweights <- brm(formula=formula.scores.noweights, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE))
+
+models <- resolve(globalenv(),result=F) 
+print("adding loo")
+
+model.main.revision <- add_criterion(model.main.revision,'loo',moment_match=T)
+model.main.pca <- add_criterion(model.main.pca,'loo',moment_match=T)
+model.qe6.revision <- add_criterion(model.qe6.revision,'loo')
+model.qe6 <- add_criterion(model.qe6,'loo')
+model.main.pca.noweights <- add_criterion(model.main.pca.noweights,'loo',moment_match=T)
+model.qe6.noweights <- add_criterion(model.qe6.noweights,'loo')
+
+model.main.revision.cumulative <- add_criterion(model.main.revision.cumulative,'loo',moment_match=T)
+model.main.pca.cumulative <- add_criterion(model.main.pca.cumulative,'loo',moment_match=T)
+model.qe6.revision.cumulative <- add_criterion(model.qe6.revision.cumulative,'loo')
+model.qe6.cumulative <- add_criterion(model.qe6.cumulative,'loo')
+model.main.pca.noweights.cumulative <- add_criterion(model.main.pca.noweights.cumulative,'loo',moment_match=T)
+model.qe6.noweights.cumulative <- add_criterion(model.qe6.noweights.cumulative,'loo')
+
+
+## model.main.revision.cs <- add_criterion(model.main.revision.cs,'loo',moment_match=T)
+## model.main.pca.cs <- add_criterion(model.main.pca.cs,'loo',moment_match=T)
+## model.qe6.revision.cs <- add_criterion(model.qe6.revision.cs,'loo')
+## model.qe6.cs <- add_criterion(model.qe6.cs,'loo')
+## model.main.pca.noweights.cs <- add_criterion(model.main.pca.noweights.cs,'loo',moment_match=T)
+## model.qe6.noweights.cs <- add_criterion(model.qe6.noweights.cs,'loo')
+
+saveRDS(model.qe6.revision,"models/ordinal_quality_qe6_revision.RDS")
+saveRDS(model.qe6,"models/ordinal_quality_qe6.RDS")
+saveRDS(model.main.pca.noweights,"models/ordinal_quality_pca.noweights.RDS")
+saveRDS(model.qe6.noweights,"models/ordinal_quality_qe6.noweights.RDS")
+saveRDS(model.main.pca,"models/ordinal_quality_pca.RDS")
+saveRDS(model.main.revision,"models/ordinal_quality_pca_revision.RDS")
+
+saveRDS(model.qe6.revision.cumulative,"models/ordinal_quality_qe6_revision.cumulative.RDS")
+saveRDS(model.qe6.cumulative,"models/ordinal_quality_qe6.cumulative.RDS")
+saveRDS(model.main.pca.noweights.cumulative,"models/ordinal_quality_pca.noweights.cumulative.RDS")
+saveRDS(model.qe6.noweights.cumulative,"models/ordinal_quality_qe6.noweights.cumulative.RDS")
+saveRDS(model.main.pca.cumulative,"models/ordinal_quality_pca.cumulative.RDS")
+saveRDS(model.main.revision.cumulative,"models/ordinal_quality_pca_revision.cumulative.RDS")
+
+## saveRDS(model.qe6.revision.cs,"models/ordinal_quality_qe6_revision.RDS")
+## saveRDS(model.qe6.cs,"models/ordinal_quality_qe6.RDS")
+## saveRDS(model.main.pca.noweights.cs,"models/ordinal_quality_pca.noweights.RDS")
+## saveRDS(model.qe6.noweights.cs,"models/ordinal_quality_qe6.noweights.RDS")
+## saveRDS(model.main.pca.cs,"models/ordinal_quality_pca.RDS")
+## saveRDS(model.main.revision.cs,"models/ordinal_quality_pca_revision.RDS")
+
+
+
+models <- resolve(globalenv(),result=F) 
+
+
+
+
+
+
+
+

Community Data Science Collective || Want to submit a patch?