]> code.communitydata.science - articlequality_ordinal.git/blobdiff - add_quality_scores.R
add the rest of the code.
[articlequality_ordinal.git] / add_quality_scores.R
diff --git a/add_quality_scores.R b/add_quality_scores.R
new file mode 100644 (file)
index 0000000..be4fbc5
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+library(arrow)
+library(brms)
+library(data.table)
+library(ggplot2)
+library(parallel)
+options(mc.cores=26)
+registerDoParallel(cores=26)
+
+dataset <- as.data.table(read_feather("data/scored_article_sample.feather"))
+dataset <- dataset[order(articleid,time_session_end)]
+quality_model <- readRDS("models/ordinal_quality_noFa.RDS")
+posterior_coefs <- as.data.table(quality_model)
+
+f <- function(cols){
+    post_qual <- as.matrix(posterior_coefs[,.(b_Stub, b_Start, b_C, b_B, b_GA)]) %*% as.numeric(cols)
+    list(med_quality = median(post_qual),
+         mean_quality = mean(post_qual),
+         sd_quality = sd(post_qual)
+         )
+
+}
+
+cl <- makeForkCluster(nnodes=26)
+res <- rbindlist(parApply(cl,dataset[,.(Stub,Start,C,B,GA)],1,f))
+dataset[,names(res):=res]
+
+f2 <- function(revscores){
+    posterior_quality <- as.matrix(posterior_coefs[,.(b_Stub,b_Start,b_C,b_B,b_GA)]) %*% t(as.matrix(revscores))
+    posterior_quality_diff <- apply(posterior_quality, 1, function(x) diff(x,1,1))
+    posterior_quality_diff2 <- apply(posterior_quality, 1, function(x) diff(x,1,2))
+    list(
+         mean_quality_diff1 = c(NA,apply(posterior_quality_diff,1,mean)),
+         sd_quality_diff1 = c(NA,apply(posterior_quality_diff,1,sd)),
+         median_quality_diff1 = c(NA,apply(posterior_quality_diff,1,median)),
+         mean_quality_diff2 = c(c(NA,NA),apply(posterior_quality_diff2,1,mean)),
+         sd_quality_diff2 = c(c(NA,NA),apply(posterior_quality_diff2,1,sd)),
+         median_quality_diff2 = c(c(NA,NA),apply(posterior_quality_diff2,1,median)))
+}
+
+
+dataset[,c("mean_qual_diff1","sd_qual_diff1","median_qual_diff1","mean_qual_diff2","sd_qual_diff2","median_qual_diff2"):=f2(.SD),by=.(articleid),.SDcols=c("Stub","Start","C","B","GA")]
+
+write_feather(dataset,'data/ordinal_scored_article_sample.feather')
+
+## in an earlier version I computed the full posterior of quality for the dataset, but it took too much memory.
+## Lines below checked (and confirmed) that posteriors were approximately normal. 
+## we can check that the means and the medians are close as a clue that normality is a good assumptoin
+## mean(med_quality/mean_quality)
+## mean(med_quality - mean_quality)
+## mean((med_quality - mean_quality)^2)
+
+## ## plot some of the posteriors to check.
+## quality_post <- dataset[1:8]
+## quality_post <- melt(quality_post)
+## p <- ggplot(quality_post, aes(x=value,group=variable)) + geom_histogram(bins=50) + facet_wrap(.~variable)
+## ggsave("plots/quality_posterior_normality.pdf",device='pdf')
+
+

Community Data Science Collective || Want to submit a patch?