# load_data.R
#
# Provenance: recovered from the git blobdiff at
# https://code.communitydata.science/articlequality_ordinal.git/blobdiff_plain/29abd26b97b7666c9b7de4521c4861e50f6a6f2c..2c733a87881c9aa70dcfe9d2c7db697c8eb14886:/load_data.R
#
# Loads the scored article sample, recodes the wp10 quality label as an
# ordered factor, drops 'a' class rows, builds a [0, 1]-scaled time covariate,
# attaches per-class sampling weights, and adds an evenly spaced quality score.
#
# Objects left in the calling environment:
#   sample.params, wp10dict, wp10.levels, df, data.counts, sample.counts,
#   weights

library(MASS)
library(brms)
options(mc.cores = 28)

library(data.table)
library(arrow)

sample.params <- readRDS("remember_sample_quality_labels.RDS")

df <- data.table(read_feather("data/scored_article_sample.feather"))

## Translate the stored wp10 code into a class name by position in wp10dict.
## NOTE(review): 'start' precedes 'stub' here, the reverse of the factor
## levels below; presumably this mirrors the encoding used when the feather
## file was written -- confirm against the code that produced it.
wp10dict <- list('start', 'stub', 'c', 'b', 'a', 'ga', 'fa')
df[, wp10 := wp10dict[wp10]]

## Quality classes from lowest to highest.
wp10.levels <- c('stub', 'start', 'c', 'b', 'a', 'ga', 'fa')
df[, wp10 := factor(wp10, levels = wp10.levels, ordered = TRUE)]

## remove 'a' class articles for a fair comparison.
df <- df[wp10 != 'a']

## Parse the "%Y%m%d%H%M%S" timestamp.
## NOTE(review): no tz is given, so the session time zone is used -- confirm
## whether the timestamps are UTC and pass tz = "UTC" if so.
df[, datetime := as.POSIXct(timestamp, format = "%Y%m%d%H%M%S")]

## Time covariate rescaled to [0, 1]. It is derived from the parsed datetime
## (seconds since the epoch) rather than from the raw YYYYmmddHHMMSS digits,
## which are not linear in elapsed time.
df[, datetime.numeric := as.numeric(datetime)]
df[, datetime.numeric := datetime.numeric - min(datetime.numeric)]
df[, datetime.numeric := datetime.numeric / max(datetime.numeric)]

## Per-class counts stored alongside the sample parameters
## (columns n_articles and n_revisions are used below).
data.counts <- data.table(sample.params$label_sample_counts)
data.counts[, wp10 := factor(wp10, levels = wp10.levels, ordered = TRUE)]

## Per-class counts (N) actually present in the sample; 'a' was dropped above.
sample.counts <- df[, .(.N), by = .(wp10)][order(wp10)]
sample.counts[, wp10 := factor(wp10,
                               levels = setdiff(wp10.levels, 'a'),
                               ordered = TRUE)]

## Weight = (class share in data.counts) / (class share in the sample).
weights <- data.counts[sample.counts, on = .(wp10)]
weights[, article_weight := (n_articles / sum(n_articles)) / (N / sum(N))]
weights[, revision_weight := (n_revisions / sum(n_revisions)) / (N / sum(N))]

df <- df[weights, on = .(wp10)]

## Evenly spaced quality score: per-class score columns weighted 1..6 in
## ascending class order. C ranks below B (see wp10.levels), so C gets 3 and
## B gets 4. One matrix product replaces the row-wise apply().
score.cols <- c('Stub', 'Start', 'C', 'B', 'GA', 'FA')
df[, quality.even6 := as.vector(as.matrix(.SD) %*% seq_along(score.cols)),
   .SDcols = score.cols]