X-Git-Url: https://code.communitydata.science/articlequality_ordinal.git/blobdiff_plain/29abd26b97b7666c9b7de4521c4861e50f6a6f2c..2c733a87881c9aa70dcfe9d2c7db697c8eb14886:/ordinal_quality_models.R diff --git a/ordinal_quality_models.R b/ordinal_quality_models.R new file mode 100644 index 0000000..99163bc --- /dev/null +++ b/ordinal_quality_models.R @@ -0,0 +1,202 @@ +source("RemembR/R/RemembeR.R") +source("load_data.R") +change.remember.file("ordinal.quality.model.RDS") + +test <- F + +remember(weights, "sample.weights") + +n.holdout <- 4000 +remember(n.holdout,"n.holdout") +holdout <- df[sample(.N,n.holdout)] +saveRDS(holdout,'data/holdout_quality_labels.RDS') +df <- df[!(revid %in% holdout$revid)] +saveRDS(df,'data/training_quality_labels.RDS') + +if(test == TRUE){ + df <- df[sample(.N,2000)] +} + +## So it turns out that the 6 predictors we have are highly correlated creating problems for sampling so use QR decomposition +df <- df[!is.na(wp10)] + +df[, start.p.stub := Start + Stub] + +saveRDS(df,"data/training_quality_labels.RDS") + +## So it turns out that the 6 predictors we have are highly correlated creating problems for sampling so use QR decomposition +df <- df[!is.na(wp10)] + +df[, start.p.stub := Start + Stub] + +fam.cloglog <- sratio(link='cloglog', threshold='flexible') +#formula.1 <- brmsformula(wp10 | weights(weight) ~ 1,decomp='QR',center=TRUE) + +fam <- sratio(link='logit', threshold='flexible') +fam.cumulative <- sratio(link='logit', threshold='flexible') + +## It turns out that the matrix is singular if we include all the predictors. +## C is the most correlated with the other variables so for now let's remove it. + +## it turns out that we don't need to do model selection at all since we don't care about the coefficients. +## we can just do the csv! +x <- df[,.(Stub,Start,C,B,GA,FA)] + +wpca <- function(x, weight){ + name <- names(x) + x <- as.matrix(x) + means <- unlist(lapply(1:dim(x)[2], function(i) weighted.mean(x[,i], weight))) + names(means) <- name + centered <- as.matrix(t(t(x) - means)) + weightmat <- diag(weight) + covmat <- t(centered) %*% weightmat %*% centered / (sum(weight) - 1) + + factors <- eigen(covmat) + basis <- factors$vectors + result <- centered %*% basis + # return a list with the info we need to do the transformation + return(list(means=means, basis=basis, x=result)) +} + +#unweighted.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],rep(1,nrow(df))) +upca <- prcomp(df[,.(Stub,Start,C,B,GA,FA)]) +unweighted.pca <- list(means = upca$center, basis=upca$rotation, x=upca$x) +saveRDS(unweighted.pca,"data/ores_pca_features.noweights.RDS") + +weighted.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],df$article_weight) +saveRDS(weighted.pca, "data/ores_pca_features.RDS") + +revision.pca <- wpca(df[,.(Stub,Start,C,B,GA,FA)],df$revision_weight) +saveRDS(revision.pca, "data/ores_pca_features_revisions.RDS") + +df <- df[,":="(pca1 = weighted.pca$x[,1], + pca2 = weighted.pca$x[,2], + pca3 = weighted.pca$x[,3], + pca4 = weighted.pca$x[,4], + pca5 = weighted.pca$x[,5], + pca6 = weighted.pca$x[,6])] + +df <- df[,":="(pca1.revision = revision.pca$x[,1], + pca2.revision = revision.pca$x[,2], + pca3.revision = revision.pca$x[,3], + pca4.revision = revision.pca$x[,4], + pca5.revision = revision.pca$x[,5], + pca6.revision = revision.pca$x[,6])] + +df <- df[,":="(pca1.noweights = unweighted.pca$x[,1], + pca2.noweights = unweighted.pca$x[,2], + pca3.noweights = unweighted.pca$x[,3], + pca4.noweights = unweighted.pca$x[,4], + pca5.noweights = unweighted.pca$x[,5], + pca6.noweights = unweighted.pca$x[,6])] + +qformula.main.pca.cs <- brmsformula(wp10 | weights(article_weight) ~ cs(pca1) + cs(pca2) + cs(pca3) + cs(pca4) + cs(pca5)) +formula.main.pca.noweights.cs <- brmsformula(wp10 ~ cs(pca1.noweights) + cs(pca2.noweights) + cs(pca3.noweights) + cs(pca4.noweights) + cs(pca5.noweights)) +formula.revision.pca.cs <- brmsformula(wp10 | weights(revision_weight) ~ cs(pca1.revision) + cs(pca2.revision) + cs(pca3.revision) + cs(pca4.revision) + cs(pca5.revision)) +formula.qe6.cs <- brmsformula(wp10 | weights(article_weight) ~ cs(quality.even6)) +formula.qe6.revision.cs <- brmsformula(wp10 | weights(revision_weight) ~ cs(quality.even6)) +formula.qe6.noweights.cs <- brmsformula(wp10 ~ cs(quality.even6)) + +formula.main.pca <- brmsformula(wp10 | weights(article_weight) ~ pca1 + pca2 + pca3 + pca4 + pca5) +formula.main.pca.noweights <- brmsformula(wp10 ~ pca1.noweights + pca2.noweights + pca3.noweights + pca4.noweights + pca5.noweights) +formula.revision.pca <- brmsformula(wp10 | weights(revision_weight) ~ pca1.revision + pca2.revision + pca3.revision + pca4.revision + pca5.revision) +formula.qe6 <- brmsformula(wp10 | weights(article_weight) ~ quality.even6) +formula.qe6.revision <- brmsformula(wp10 | weights(revision_weight) ~ quality.even6) +formula.qe6.noweights <- brmsformula(wp10 ~ quality.even6) + +formula.scores.noweights <- brmsformula(wp10 ~ Start + Stub + GA + FA + B) + + +library(future) +library(parallel) +options(mc.cores = parallel::detectCores()) + +plan(lapply(1:7,function(x) tweak(multisession, workers=4))) + +model.main.pca %<-% brm(formula=formula.main.pca, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE)) +model.qe6 %<-% brm(formula.qe6, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) + +model.main.revision %<-% brm(formula=formula.revision.pca, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE)) + +model.qe6.revision %<-% brm(formula.qe6.revision, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) +model.qe6.noweights %<-% brm(formula.qe6.noweights, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) +model.main.pca.noweights %<-% brm(formula=formula.main.pca.noweights, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE)) + +## model.main.pca.cs %<-% brm(formula=formula.main.pca.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE)) +## model.qe6.cs %<-% brm(formula.qe6.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) + +## model.main.revision.cs %<-% brm(formula=formula.revision.pca.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE)) + +## model.qe6.revision.cs %<-% brm(formula.qe6.revision.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) +## model.qe6.noweights.cs %<-% brm(formula.qe6.noweights.cs, data=df, family=fam, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) +## model.main.pca.noweights.cs %<-% brm(formula=formula.main.pca.noweights.cs, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE)) + +model.main.pca.cumulative %<-% brm(formula=formula.main.pca, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE)) +model.qe6.cumulative %<-% brm(formula.qe6, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) + +model.main.revision.cumulative %<-% brm(formula=formula.revision.pca, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE, save_pars=save_pars(all=TRUE)) + +model.qe6.revision.cumulative %<-% brm(formula.qe6.revision, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) +model.qe6.noweights.cumulative %<-% brm(formula.qe6.noweights, data=df, family=fam.cumulative, control=list(max_treedepth=15),future=TRUE,save_pars=save_pars(all=TRUE)) +model.main.pca.noweights.cumulative %<-% brm(formula=formula.main.pca.noweights, data=df, family=fam.cumulative, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE)) + + +#model.scores.noweights <- brm(formula=formula.scores.noweights, data=df, family=fam, control=list(max_treedepth=15), future=TRUE,save_pars=save_pars(all=TRUE)) + +models <- resolve(globalenv(),result=F) +print("adding loo") + +model.main.revision <- add_criterion(model.main.revision,'loo',moment_match=T) +model.main.pca <- add_criterion(model.main.pca,'loo',moment_match=T) +model.qe6.revision <- add_criterion(model.qe6.revision,'loo') +model.qe6 <- add_criterion(model.qe6,'loo') +model.main.pca.noweights <- add_criterion(model.main.pca.noweights,'loo',moment_match=T) +model.qe6.noweights <- add_criterion(model.qe6.noweights,'loo') + +model.main.revision.cumulative <- add_criterion(model.main.revision.cumulative,'loo',moment_match=T) +model.main.pca.cumulative <- add_criterion(model.main.pca.cumulative,'loo',moment_match=T) +model.qe6.revision.cumulative <- add_criterion(model.qe6.revision.cumulative,'loo') +model.qe6.cumulative <- add_criterion(model.qe6.cumulative,'loo') +model.main.pca.noweights.cumulative <- add_criterion(model.main.pca.noweights.cumulative,'loo',moment_match=T) +model.qe6.noweights.cumulative <- add_criterion(model.qe6.noweights.cumulative,'loo') + + +## model.main.revision.cs <- add_criterion(model.main.revision.cs,'loo',moment_match=T) +## model.main.pca.cs <- add_criterion(model.main.pca.cs,'loo',moment_match=T) +## model.qe6.revision.cs <- add_criterion(model.qe6.revision.cs,'loo') +## model.qe6.cs <- add_criterion(model.qe6.cs,'loo') +## model.main.pca.noweights.cs <- add_criterion(model.main.pca.noweights.cs,'loo',moment_match=T) +## model.qe6.noweights.cs <- add_criterion(model.qe6.noweights.cs,'loo') + +saveRDS(model.qe6.revision,"models/ordinal_quality_qe6_revision.RDS") +saveRDS(model.qe6,"models/ordinal_quality_qe6.RDS") +saveRDS(model.main.pca.noweights,"models/ordinal_quality_pca.noweights.RDS") +saveRDS(model.qe6.noweights,"models/ordinal_quality_qe6.noweights.RDS") +saveRDS(model.main.pca,"models/ordinal_quality_pca.RDS") +saveRDS(model.main.revision,"models/ordinal_quality_pca_revision.RDS") + +saveRDS(model.qe6.revision.cumulative,"models/ordinal_quality_qe6_revision.cumulative.RDS") +saveRDS(model.qe6.cumulative,"models/ordinal_quality_qe6.cumulative.RDS") +saveRDS(model.main.pca.noweights.cumulative,"models/ordinal_quality_pca.noweights.cumulative.RDS") +saveRDS(model.qe6.noweights.cumulative,"models/ordinal_quality_qe6.noweights.cumulative.RDS") +saveRDS(model.main.pca.cumulative,"models/ordinal_quality_pca.cumulative.RDS") +saveRDS(model.main.revision.cumulative,"models/ordinal_quality_pca_revision.cumulative.RDS") + +## saveRDS(model.qe6.revision.cs,"models/ordinal_quality_qe6_revision.RDS") +## saveRDS(model.qe6.cs,"models/ordinal_quality_qe6.RDS") +## saveRDS(model.main.pca.noweights.cs,"models/ordinal_quality_pca.noweights.RDS") +## saveRDS(model.qe6.noweights.cs,"models/ordinal_quality_qe6.noweights.RDS") +## saveRDS(model.main.pca.cs,"models/ordinal_quality_pca.RDS") +## saveRDS(model.main.revision.cs,"models/ordinal_quality_pca_revision.RDS") + + + +models <- resolve(globalenv(),result=F) + + + + + + + +