3 # Fits newcomer retention models
5 # Copyright (C) 2018 Nathan TeBlunthuis
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 if(!exists("newcomers")){
22 source("01_build_datasets.R")
26 if(use.sample == TRUE){
27 source("lib-01-sample-datasets.R")
28 newcomer.ds <- sample.newcomers()
30 newcomer.ds <- newcomers
36 newcomer.ds <- newcomer.ds[,wiki:=as.factor(wiki.name)]
38 newcomer.ds <- newcomer.ds[,":="(
39 wiki.age.log = log1p(as.double(wiki.age,units='weeks')),
40 is.bot.reverted = ifelse(is.na(is.bot.reverted),FALSE,is.bot.reverted),
41 is.admin.reverted = ifelse(is.na(is.admin.reverted),FALSE,is.admin.reverted),
42 year = as.factor(year(time.first.edit)),
43 month = as.factor(paste0(year(time.first.edit),month(time.first.edit))),
44 ns0.edits.log = log1p(ns0.edits),
45 ns1.edits.log = log1p(ns1.edits),
46 ns4.edits.log = log1p(ns4.edits),
47 n.other.wikis.log = log1p(n.other.wikis),
48 n.edits.other.log = log1p(n.edits.other),
49 n.messages.log = log1p(n.messages),
50 n.editors.log = log1p(n.editors),
51 total.wiki.length.log = log1p(total.wiki.length),
52 n.ns4.edits.log = log1p(n.ns4.edits),
53 n.ns4.editors.log = log1p(n.ns4.editors),
54 ns4.editor.age.log = log1p(as.double(ns4.editor.age,units='years')),
55 d.ns4.length.scaled = scale(d.ns4.length),
56 newcomer.chars.changed.scaled = scale(newcomer.chars.change),
57 session.edits.log = log1p(session.edits),
58 wiki.age = as.double(wiki.age,units='years')
61 ## record summary stats for our analytic variables
62 newcomer.summary.stats <- list()
63 newcomer.summary.stats$p.survives <- mean(newcomer.ds$survives)
64 newcomer.summary.stats$var.survives <- var(newcomer.ds$survives)
66 outliers <- newcomer.ds[session.edits >= 100]
67 newcomer.summary.stats$N.outliers <- nrow(outliers)
68 newcomer.summary.stats$p.first.session.no.outliers <- mean(newcomer.ds[session.edits < 100]$session.edits)
69 newcomer.summary.stats$var.first.session.no.outliers <- var(newcomer.ds[session.edits < 100]$session.edits)
71 newcomer.summary.stats$p.reverted <- mean(newcomer.ds$is.reverted)
72 newcomer.summary.stats$var.reverted <- var(newcomer.ds$is.reverted)
73 newcomer.summary.stats$p.messaged <- mean(newcomer.ds$is.messaged)
74 newcomer.summary.stats$var.messaged <- var(newcomer.ds$is.messaged)
75 newcomer.summary.stats$mean.first.session.edits <- mean(newcomer.ds$session.edits)
76 newcomer.summary.stats$var.first.session.edits <- var(newcomer.ds$session.edits)
77 newcomer.summary.stats$med.first.session.edits <- median(newcomer.ds$session.edits)
78 newcomer.summary.stats$p.bot.reverted <- mean(newcomer.ds$is.bot.reverted)
79 newcomer.summary.stats$var.bot.reverted <- var(newcomer.ds$is.bot.reverted)
80 remember(newcomer.summary.stats)
82 halfak.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki.age + quarter + wiki.name")
84 newcomer.ds.all <- newcomer.ds
85 newcomer.ds <- newcomer.ds[n.other.wikis==0]
87 print('fitting halfak model on all newcomers')
88 halfak.mod.all.newcomers <- glm(halfak.formula,data=newcomer.ds.all,family=binomial(link=logit))
89 saveRDS(halfak.mod.all.newcomers,"halfak.mod.all.newcomers.RDS")
90 remember(extract(halfak.mod.all.newcomers),"halfak.model.all.newcomers",silent=TRUE)
92 print("fitting halfak model")
93 halfak.mod <- glm(halfak.formula,data=newcomer.ds,family=binomial(link=logit))
94 saveRDS(halfak.mod,"halfak.mod.RDS")
95 remember(extract(halfak.mod),"halfak.model",silent=TRUE)
97 print('fitting halfak model with weights')
98 n.total.wikis <- length(unique(newcomer.ds$wiki.name))
99 weight.per.wiki <- nrow(newcomer.ds)/n.total.wikis
100 newcomer.ds <- newcomer.ds[,weights:=weight.per.wiki/.N,by=wiki.name]
101 halfak.mod.weighted <- glm(halfak.formula,data=newcomer.ds,family=binomial(link=logit),weights=newcomer.ds$weights)
102 saveRDS(halfak.mod.weighted,"halfak.mod.weighted.RDS")
103 remember(extract(halfak.mod.weighted),"halfak.model.weighted",silent=TRUE)
105 ## print('fit halfak model on a sample')
107 ## newcomer.ds <- newcomer.ds[,in.sample:=.N >= sample.size, by=wiki.name]
108 ## newcomer.ds.sample <- newcomer.ds[,.SD[sample(.N,min(sample.size,.N))],by=wiki.name]
109 ## halfak.mod.sample <- glm(halfak.formula,data=newcomer.ds.sample,family=binomial(link=logit))
110 ## saveRDS(halfak.mod.sample,"halfak.mod.sample.RDS")
111 ## remember(extract(halfak.mod.sample),"halfak.model.sample",silent=TRUE)
113 print('fitting RE model')
115 print('fitting re model')
116 re.icc.survives.model <- glmer(as.formula("survives ~ + (1 | wiki) - 1"),data=newcomer.ds,family=binomial(link=logit))
117 saveRDS(re.icc.survives.model,"re.icc.survives.model.RDS")
118 varcorrmat <- as.data.table(VarCorr(re.icc.survives.model))
119 wiki.var <- varcorrmat[grp=='wiki' & var1=="(Intercept)" ,vcov]
120 group.var <- var(residuals(re.icc.survives.model))
121 icc <- wiki.var/(group.var + wiki.var)
122 remember(varcorrmat,'icc.survives.varcormat')
123 remember(group.var,'icc.survives.group.var')
124 remember(icc,'icc.survives')
126 ## newcomer.no.pooling.f <- as.formula("survives ~ is.reverted:wiki.name + is.messaged:wiki.name + is.bot.reverted:wiki.name + session.edits.log:wiki.name + wiki.name + quarter:wiki.name + wiki.name:wiki.age - 1")
127 ## newcomer.no.pooling.mod <- glm(newcomer.no.pooling.f,gdata=newcomer.ds,family=binomial(link=logit))
128 ## remember(extract(newcomer.no.pooling.mod),"newcomer.no.pooling.mod",silent=TRUE)
130 ## if( !(exists("halfak.robustnes1.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
131 ## halfak.robustness1.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki + quarter + wiki:wiki.age")
132 ## print("fitting halfak robustness 1 model")
133 ## newcomer.robustness.ds <- newcomer.ds[p.reverted <= 0.05]
134 ## halfak.robustness1.mod <- glm(halfak.robustness1.formula,data=newcomer.robustness.ds,family=binomial(link=logit))
135 ## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
136 ## remember(extract(halfak.robustness1.mod),"halfak.robustness1.model")
138 ## else if(file.exists("halfak.robustness1.mod.RDS") & !exists("halfak.robustness1.mod")){
139 ## newcomer.no.pooling.mod <- readRDS("halfak.robustness1.mod.RDS")
141 ## else if (exists("halfak.robustness1.mod")){
142 ## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
144 ## remember(extract(halfak.robustness1.mod),"halfak.robustness1.mod")
147 ## if( !(exists("halfak.robustnes2.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
148 ## halfak.robustness2.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits .log + wiki + quarter + wiki:wiki.age")
149 ## print("fitting halfak robustness 2 model")
150 ## newcomer.robustness.ds2 <- newcomer.ds[p.reverted <= 0.5]
151 ## halfak.robustness2.mod <- glm(halfak.robustness2.formula,data=newcomer.robustness.ds2,family=binomial(link=logit))
152 ## saveRDS(halfak.robustness1.mod,"halfak.robustness2.mod.RDS")
153 ## remember(extract(halfak.robustness1.mod),"halfak.robustness2.model")
155 ## else if(file.exists("halfak.robustness2.mod.RDS") & !exists("halfak.robustness2.mod")){
156 ## halfak.robustness2.mod <- readRDS("halfak.robustness2.mod.RDS")
158 ## else if (exists("halfak.robustness2.mod")){
159 ## saveRDS(halfak.robustness2.mod,"halfak.robustness2.mod.RDS")
161 ## remember(extract(halfak.robustness2.mod),"halfak.robustness2.mod")