X-Git-Url: https://code.communitydata.science/articlequality_ordinal.git/blobdiff_plain/29abd26b97b7666c9b7de4521c4861e50f6a6f2c..2c733a87881c9aa70dcfe9d2c7db697c8eb14886:/add_quality_scores.R diff --git a/add_quality_scores.R b/add_quality_scores.R new file mode 100644 index 0000000..be4fbc5 --- /dev/null +++ b/add_quality_scores.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript +library(arrow) +library(brms) +library(data.table) +library(ggplot2) +library(parallel) +options(mc.cores=26) +registerDoParallel(cores=26) + +dataset <- as.data.table(read_feather("data/scored_article_sample.feather")) +dataset <- dataset[order(articleid,time_session_end)] +quality_model <- readRDS("models/ordinal_quality_noFa.RDS") +posterior_coefs <- as.data.table(quality_model) + +f <- function(cols){ + post_qual <- as.matrix(posterior_coefs[,.(b_Stub, b_Start, b_C, b_B, b_GA)]) %*% as.numeric(cols) + list(med_quality = median(post_qual), + mean_quality = mean(post_qual), + sd_quality = sd(post_qual) + ) + +} + +cl <- makeForkCluster(nnodes=26) +res <- rbindlist(parApply(cl,dataset[,.(Stub,Start,C,B,GA)],1,f)) +dataset[,names(res):=res] + +f2 <- function(revscores){ + posterior_quality <- as.matrix(posterior_coefs[,.(b_Stub,b_Start,b_C,b_B,b_GA)]) %*% t(as.matrix(revscores)) + posterior_quality_diff <- apply(posterior_quality, 1, function(x) diff(x,1,1)) + posterior_quality_diff2 <- apply(posterior_quality, 1, function(x) diff(x,1,2)) + list( + mean_quality_diff1 = c(NA,apply(posterior_quality_diff,1,mean)), + sd_quality_diff1 = c(NA,apply(posterior_quality_diff,1,sd)), + median_quality_diff1 = c(NA,apply(posterior_quality_diff,1,median)), + mean_quality_diff2 = c(c(NA,NA),apply(posterior_quality_diff2,1,mean)), + sd_quality_diff2 = c(c(NA,NA),apply(posterior_quality_diff2,1,sd)), + median_quality_diff2 = c(c(NA,NA),apply(posterior_quality_diff2,1,median))) +} + + +dataset[,c("mean_qual_diff1","sd_qual_diff1","median_qual_diff1","mean_qual_diff2","sd_qual_diff2","median_qual_diff2"):=f2(.SD),by=.(articleid),.SDcols=c("Stub","Start","C","B","GA")] + +write_feather(dataset,'data/ordinal_scored_article_sample.feather') + +## in an earlier version I computed the full posterior of quality for the dataset, but it took too much memory. +## Lines below checked (and confirmed) that posteriors were approximately normal. +## we can check that the means and the medians are close as a clue that normality is a good assumptoin +## mean(med_quality/mean_quality) +## mean(med_quality - mean_quality) +## mean((med_quality - mean_quality)^2) + +## ## plot some of the posteriors to check. +## quality_post <- dataset[1:8] +## quality_post <- melt(quality_post) +## p <- ggplot(quality_post, aes(x=value,group=variable)) + geom_histogram(bins=50) + facet_wrap(.~variable) +## ggsave("plots/quality_posterior_normality.pdf",device='pdf') + +