]> code.communitydata.science - rises_declines_wikia_code.git/blob - 01_build_datasets.R
add copy of the GPL
[rises_declines_wikia_code.git] / 01_build_datasets.R
1 #!/usr/bin/env Rscript
2 # Top level script for building datasets. 
3 # Copyright (C) 2018  Nathan TeBlunthuis
4
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14
15 # You should have received a copy of the GNU General Public License
16 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
18 library(data.table)
19 library(parallel)
20
## Plot the distribution of a single variable of a dataset.
##
## Logical variables are drawn with a default qplot(); every other variable
## gets a density-scaled histogram (100 bins) with the empirical density
## drawn on top as a line.
##
## Arguments:
##   data     object that supports `data[[varname]]` (e.g. a data.table/list).
##   varname  name of the variable to plot; also used in the output file name.
##   save     if TRUE the plot is written to
##            "plots/<varname>.distribution.pdf", otherwise it is drawn on the
##            current graphics device.
##
## Returns the plot object, invisibly.
##
## NOTE: qplot() and the geom_*/scale_*/theme helpers come from ggplot2, which
## this function does not attach itself; the caller must have loaded it.
plot.distribution <- function(data, varname, save = TRUE) {
    x <- data[[varname]]
    print(paste("plotting distribution for", varname))

    if (save) {
        ## pdf() cannot create missing directories, so make sure it exists.
        dir.create("plots", showWarnings = FALSE)
        pdf(paste0("plots/", varname, ".distribution.pdf"))
        ## Close the device even if building/printing the plot fails, so a
        ## failed plot does not leave a dangling PDF device open.
        on.exit(dev.off(), add = TRUE)
    }

    ## is.logical() instead of class(x) == "logical": class() may return
    ## several classes, which makes the comparison a length > 1 condition
    ## (an error inside if() on recent R versions).
    if (is.logical(x)) {
        p0 <- qplot(x)
    } else {
        ## overlay the histogram and the empirical density
        p0 <- qplot(x, geom = 'blank') +
            geom_line(aes(y = ..density.., colour = 'Empirical'), stat = 'density') +
            geom_histogram(aes(y = ..density..), alpha = 0.4, bins = 100) +
            scale_colour_manual(name = 'Density', values = c('red', 'blue')) +
            theme(legend.position = c(0.85, 0.85))
    }

    print(p0)
    invisible(p0)
}
44
## Source the utility library only when `wiki.list` is not yet defined in this
## session (presumably lib-00-utils.R is what defines it -- TODO confirm).
if (!exists("wiki.list")) {
    source("lib-00-utils.R", echo = TRUE)
}
48
## Make sure `bots` and `admins` are available: reuse them if both already
## exist in the session, otherwise read both cached .RDS files when present,
## and fall back to running the user-role generation script.
## Scalar conditions use the short-circuit operators `||` / `&&` rather than
## the elementwise `|` / `&`.
if (!exists("bots") || !exists("admins")) {
    if (file.exists("bots.RDS") && file.exists("admins.RDS")) {
        bots <- readRDS("bots.RDS")
        admins <- readRDS("admins.RDS")
    } else {
        source("lib-01-generate_userroles.R", echo = TRUE)
    }
}
58
## Unless `newcomer.dt` already exists in the session, run the newcomer-table
## build script whenever at least one of the intermediate .RDS files is missing.
if (!exists("newcomer.dt")) {
    intermediate.files <- list("newcomers.RDS",
                               "wikiweeks.RDS",
                               "wiki.stats.RDS",
                               "active.editors.RDS")
    ## file.exists() is vectorised, so all paths are checked in one call
    if (!all(file.exists(unlist(intermediate.files)))) {
        source("lib-01-build_newcomer_table.R", echo = TRUE)
    }
}
65
## Switch for the optional diagnostic plots; disabled by default.
plot.distributions <- FALSE
if (isTRUE(plot.distributions)) {
    library(ggplot2)

    ## variables of model 1: outcome, predictors and controls
    outcome1 <- c("survives")
    predictors1 <- c(
        "is.reverted", "is.messaged", "is.bot.reverted",
        "is.reverted.messaged", "is.admin.reverted",
        "BRD.initiation", "BRD.reciprocation"
    )
    controls1 <- c(
        "ns0.edits", "ns1.edits", "ns4.edits", "n.other.wikis", "week",
        "has.edited.other.wikis", "n.edits.other", "n.messages", "n.editors",
        "total.wiki.length", "revert.rate", "revert.disc.rate",
        "newcomer.revert.disc.rate", "revert.message.rate",
        "newcomer.revert.message.rate", "newcomer.edits.rate",
        "bot.revert.rate", "bot.revert.prop", "newcomer.bot.revert.rate",
        "newcomer.bot.revert.prop", "admin.revert.rate", "admin.revert.prop",
        "n.ns4.edits", "n.ns4.editors", "d.ns4.length", "ns4.editor.age",
        "age", "wiki.age"
    )

    ## NOTE(review): `newcomers` is not defined anywhere in this file, and the
    ## build step above is guarded on `newcomer.dt`; presumably the sourced
    ## build script creates `newcomers` -- confirm it is available when the
    ## cached .RDS files cause that script to be skipped.
    for (varname in c(outcome1, predictors1, controls1)) {
        plot.distribution(newcomers, varname)
    }
}

Community Data Science Collective || Want to submit a patch?