#!/usr/bin/env Rscript

# Creates data for plotting
# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

library("ggplot2")
library("bootstrap")
library("scales")
source("lib-00-utils.R")

# Build the input tables only when they are not already in the session.
# NOTE(review): `remember()`, `newcomers` and `all.edits` are not defined in
# this file; presumably they come from the two sourced scripts -- confirm.
if (!exists("newcomers")) {
  source("01_build_datasets.R")
}

remember(min(all.edits$date.time), "earliest.data.point")
remember(max(all.edits$date.time), "latest.data.point")

## ---- Plot 1: newcomer reversion / survival by wiki age ----

# One row per (wiki, half-year of wiki age): mean and variance of the
# `is.reverted` and `survives` columns plus the number of rows in the cell.
p1.data <- newcomers[,
  .(
    p.reverted = mean(is.reverted),
    var.reverted = var(is.reverted),
    p.survives = mean(survives),
    var.survives = var(survives),
    N = .N
  ),
  by = .(wiki.name, wiki.age.half.years)
]

# Drop cells that contain a single row.
p1.data <- p1.data[N > 1]

# Number of wikis contributing to each half-year bucket.
p1.data[, N.wikis := .N, by = .(wiki.age.half.years)]

## put p1 data onto sd scales
# Each wiki's proportions are divided by that wiki's own standard deviation.
# No centering is applied, and no filter removes non-finite results here.
p1.data[, p.survives.in.sd := p.survives / sd(p.survives), by = .(wiki.name)]
p1.data[, p.reverted.in.sd := p.reverted / sd(p.reverted), by = .(wiki.name)]

p.data <- melt(
  p1.data,
  id.vars = c("wiki.name", "wiki.age.half.years"),
  measure.vars = c(
    "p.survives", "p.reverted", "p.survives.in.sd", "p.reverted.in.sd"
  )
)

# Box-plot summary (boxplot.stats five numbers), mean and count for every
# (half-year, variable) cell.
p.stats <- p.data[,
  as.list(c(
    setNames(
      boxplot.stats(value, coef = 1.5)$stats,
      c("min", "q1", "med", "q3", "max")
    ),
    mu = mean(value),
    N.wikis = .N
  )),
  by = .(wiki.age.half.years, variable)
]
remember(p.stats)

# Human-readable facet labels for the two raw proportions.
# The replacement form is kept on purpose instead of `:=`: `:=` would modify
# the table by reference, and `p.stats` was handed to remember() just above.
# NOTE(review): whether remember() keeps a reference is not visible here.
p.stats[variable == "p.survives"]$variable <- "Survives"
p.stats[variable == "p.reverted"]$variable <- "Reverted"

# One-sided Spearman tests of the association between wiki age and the
# survival proportion ("less") and the reversion proportion ("greater").
remember(
  cor.test(
    p1.data$wiki.age.half.years,
    p1.data$p.survives,
    method = "spearman",
    alternative = "less"
  ),
  "survives.cor.test"
)
remember(
  cor.test(
    p1.data$wiki.age.half.years,
    p1.data$p.reverted,
    method = "spearman",
    alternative = "greater"
  ),
  "reverted.cor.test"
)

# NOTE(review): labels say "Year" but are generated from wiki.age.half.years,
# and they assume the factor levels are exactly 0..max -- confirm.
xlabels <- paste0("Year ", 0:max(p.stats$wiki.age.half.years))

p <- ggplot(
  p.stats,
  aes(
    x = as.factor(wiki.age.half.years),
    ymin = min,
    lower = q1,
    middle = med,
    upper = q3,
    ymax = max,
    width = 0.3
  )
)
# The five-number summaries are precomputed, hence stat = "identity".
p <- p + geom_boxplot(stat = "identity")
# Dashed line through the medians; the numeric x is shifted by one.
# NOTE(review): presumably this aligns ages starting at 0 with discrete axis
# positions starting at 1 -- confirm.
p <- p + geom_line(aes(x = wiki.age.half.years + 1, y = med), linetype = 2)
p <- p + facet_wrap(
  "variable",
  nrow = 2,
  strip.position = "bottom",
  scales = "free"
)
p <- p +
  scale_y_continuous(name = "Proportion of newcomers", minor_breaks = NULL) +
  scale_x_discrete(name = "Wiki age", labels = xlabels)
# "none" (lowercase) is the documented ggplot2 value for hiding the legend.
p <- p + theme_bw() + theme(legend.position = "none")

# No file name is given, so pdf() writes to its default output file.
pdf(width = 6, height = 6)
print(p)
dev.off()

## ---- Plot 2: active editors by wiki age in months ----

# Edits per (wiki, editor, month of wiki age).
active.editors <- all.edits[,
  .(N.edits = .N, wiki.age.years = first(wiki.age.years)),
  by = .(wiki.name, editor, wiki.age.months)
]

# Editors with at least 5 edits in a wiki-month, counted per (wiki, month).
n.active.editors <- active.editors[
  N.edits >= 5,
  .(N.active.editors = .N, wiki.age.years = first(wiki.age.years)),
  by = .(wiki.name, wiki.age.months)
]

# Number of rows (one per wiki) present at each wiki age.
n.active.editors[, ":="(N = .N), by = .(wiki.age.months)]

# Per-wiki maximum age, maximum active-editor count, and the count divided by
# that wiki's standard deviation of counts.
n.active.editors[,
  ":="(
    max.age = max(wiki.age.months),
    max.active.editors = max(N.active.editors),
    sd.units.active.editors = N.active.editors / sd(N.active.editors)
  ),
  by = "wiki.name"
]
n.active.editors[,
  ":="(active.editors.pmax = N.active.editors / max.active.editors)
]

# Age cutoff: the 0.90 quantile of `max.age`, taken over the rows of
# n.active.editors.
wiki.age.quantile <- .90
max.age.months <- quantile(n.active.editors$max.age, wiki.age.quantile)

# Bootstrap (5000 resamples) of the mean sd-scaled count at each wiki age,
# using only finite values at or below the age cutoff.
boot <- n.active.editors[
  is.finite(sd.units.active.editors) & wiki.age.months <= max.age.months,
  .(
    thetastar = bootstrap(
      x = sd.units.active.editors,
      nboot = 5000,
      mean
    )$thetastar
  ),
  by = .(wiki.age.months)
]

# Percentile interval: the 2.5% and 97.5% quantiles of the bootstrap means.
boot.ci <- boot[,
  as.list(quantile(thetastar, probs = c(0.025, 0.975))),
  by = .(wiki.age.months)
]
# Rename by reference; `names<-` is discouraged on data.tables.
data.table::setnames(boot.ci, c("wiki.age.months", "lower.ci", "upper.ci"))

# Means across wikis at each wiki age, with the same filter as the bootstrap.
plot2.data <- n.active.editors[
  is.finite(sd.units.active.editors) & wiki.age.months <= max.age.months,
  .(
    sd.units.active.editors = mean(sd.units.active.editors),
    N.active.editors = mean(N.active.editors),
    wiki.age.years = first(wiki.age.years),
    N.wikis = .N
  ),
  by = .(wiki.age.months)
]

# Attach the bootstrap interval to each wiki age via an update join.
plot2.data[
  boot.ci,
  ":="(lower.ci = lower.ci, upper.ci = upper.ci),
  on = "wiki.age.months"
]
remember(plot2.data, "plot.active.editors.dt")