#!/usr/bin/env Rscript

# Fits newcomer retention models
# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

## Overview
## --------
## 1. Obtain the `newcomers` table (building it if it is not in the session).
## 2. Derive log / scaled analytic variables on a private copy of the table.
## 3. Record summary statistics through `remember()`.
## 4. Fit three logistic regressions sharing `halfak.formula`: all newcomers,
##    newcomers with n.other.wikis == 0, and the latter weighted per wiki.
## 5. Fit a random-intercept model and derive an intraclass correlation.
##
## `remember()` and `extract()` are not defined in this file.
## NOTE(review): presumably they come from the sourced project scripts or
## attached packages -- confirm before running this file standalone.

library(scales)

if (!exists("newcomers")) {
  source("01_build_datasets.R")
}

use.sample <- FALSE

if (isTRUE(use.sample)) {
  source("lib-01-sample-datasets.R")
  newcomer.ds <- sample.newcomers()
} else {
  ## Work on a copy. A plain assignment would alias the data.table, and the
  ## `:=` calls below would then rewrite `newcomers` in place (including
  ## replacing `wiki.age` by a plain number of years), so re-running this
  ## script in the same session would derive `wiki.age.log` from the already
  ## converted values.
  newcomer.ds <- data.table::copy(newcomers)
}

library("optimx")
library("lme4")

newcomer.ds <- newcomer.ds[, wiki := as.factor(wiki.name)]

## All right-hand sides are evaluated before any column is assigned, so
## `wiki.age.log` (weeks) and the new `wiki.age` (years) are both derived from
## the incoming `wiki.age` values.
## scale() returns a one-column matrix carrying centring/scaling attributes;
## as.vector() stores plain numeric columns instead.
## NOTE(review): `newcomer.chars.changed.scaled` is computed from a column
## named `newcomer.chars.change` (no trailing "d") -- confirm the column name.
newcomer.ds <- newcomer.ds[, ":="(
  wiki.age.log = log1p(as.double(wiki.age, units = "weeks")),
  is.bot.reverted = ifelse(is.na(is.bot.reverted), FALSE, is.bot.reverted),
  is.admin.reverted = ifelse(is.na(is.admin.reverted), FALSE, is.admin.reverted),
  year = as.factor(year(time.first.edit)),
  month = as.factor(paste0(year(time.first.edit), month(time.first.edit))),
  ns0.edits.log = log1p(ns0.edits),
  ns1.edits.log = log1p(ns1.edits),
  ns4.edits.log = log1p(ns4.edits),
  n.other.wikis.log = log1p(n.other.wikis),
  n.edits.other.log = log1p(n.edits.other),
  n.messages.log = log1p(n.messages),
  n.editors.log = log1p(n.editors),
  total.wiki.length.log = log1p(total.wiki.length),
  n.ns4.edits.log = log1p(n.ns4.edits),
  n.ns4.editors.log = log1p(n.ns4.editors),
  ns4.editor.age.log = log1p(as.double(ns4.editor.age, units = "years")),
  d.ns4.length.scaled = as.vector(scale(d.ns4.length)),
  newcomer.chars.changed.scaled = as.vector(scale(newcomer.chars.change)),
  session.edits.log = log1p(session.edits),
  wiki.age = as.double(wiki.age, units = "years")
)]

## record summary stats for our analytic variables
newcomer.summary.stats <- list()

newcomer.summary.stats$p.survives <- mean(newcomer.ds$survives)
newcomer.summary.stats$var.survives <- var(newcomer.ds$survives)

## Sessions with 100 or more edits are counted as outliers; the
## "no.outliers" statistics use only sessions below that threshold.
outliers <- newcomer.ds[session.edits >= 100]
newcomer.summary.stats$N.outliers <- nrow(outliers)

non.outlier.session.edits <- newcomer.ds[session.edits < 100]$session.edits
newcomer.summary.stats$p.first.session.no.outliers <- mean(non.outlier.session.edits)
newcomer.summary.stats$var.first.session.no.outliers <- var(non.outlier.session.edits)

newcomer.summary.stats$p.reverted <- mean(newcomer.ds$is.reverted)
newcomer.summary.stats$var.reverted <- var(newcomer.ds$is.reverted)

newcomer.summary.stats$p.messaged <- mean(newcomer.ds$is.messaged)
newcomer.summary.stats$var.messaged <- var(newcomer.ds$is.messaged)

newcomer.summary.stats$mean.first.session.edits <- mean(newcomer.ds$session.edits)
newcomer.summary.stats$var.first.session.edits <- var(newcomer.ds$session.edits)
newcomer.summary.stats$med.first.session.edits <- median(newcomer.ds$session.edits)

newcomer.summary.stats$p.bot.reverted <- mean(newcomer.ds$is.bot.reverted)
newcomer.summary.stats$var.bot.reverted <- var(newcomer.ds$is.bot.reverted)

remember(newcomer.summary.stats)

halfak.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki.age + quarter + wiki.name")

## Keep a handle on the full table, then restrict `newcomer.ds` to newcomers
## with no recorded activity on other wikis (n.other.wikis == 0).
newcomer.ds.all <- newcomer.ds
newcomer.ds <- newcomer.ds[n.other.wikis == 0]

print('fitting halfak model on all newcomers')
halfak.mod.all.newcomers <- glm(
  halfak.formula,
  data = newcomer.ds.all,
  family = binomial(link = logit)
)
saveRDS(halfak.mod.all.newcomers, "halfak.mod.all.newcomers.RDS")
remember(extract(halfak.mod.all.newcomers), "halfak.model.all.newcomers", silent = TRUE)

print("fitting halfak model")
halfak.mod <- glm(
  halfak.formula,
  data = newcomer.ds,
  family = binomial(link = logit)
)
saveRDS(halfak.mod, "halfak.mod.RDS")
remember(extract(halfak.mod), "halfak.model", silent = TRUE)

print('fitting halfak model with weights')
## Every row of a wiki gets weight (nrow / number of wikis) / (rows in that
## wiki), so each wiki's weights sum to the same total and all weights
## together sum to nrow(newcomer.ds).
n.total.wikis <- length(unique(newcomer.ds$wiki.name))
weight.per.wiki <- nrow(newcomer.ds) / n.total.wikis
newcomer.ds <- newcomer.ds[, weights := weight.per.wiki / .N, by = wiki.name]
halfak.mod.weighted <- glm(
  halfak.formula,
  data = newcomer.ds,
  family = binomial(link = logit),
  weights = newcomer.ds$weights
)
saveRDS(halfak.mod.weighted, "halfak.mod.weighted.RDS")
remember(extract(halfak.mod.weighted), "halfak.model.weighted", silent = TRUE)

## NOTE(review): the commented-out code in this file is kept verbatim. It
## contains apparent typos (`gdata=`, `halfak.robustnes1.mod`,
## `halfak.robustnes2.mod`, `session.edits .log`, robustness1 objects saved
## under robustness2 names, unbalanced braces) that need fixing before it is
## re-enabled.

## print('fit halfak model on a sample')
## sample.size <- 30
## newcomer.ds <- newcomer.ds[,in.sample:=.N >= sample.size, by=wiki.name]
## newcomer.ds.sample <- newcomer.ds[,.SD[sample(.N,min(sample.size,.N))],by=wiki.name]
## halfak.mod.sample <- glm(halfak.formula,data=newcomer.ds.sample,family=binomial(link=logit))
## saveRDS(halfak.mod.sample,"halfak.mod.sample.RDS")
## remember(extract(halfak.mod.sample),"halfak.model.sample",silent=TRUE)

print('fitting RE model')
## Random intercept per wiki; the "- 1" drops the fixed intercept.
## NOTE(review): confirm that omitting the fixed intercept is intended for
## this ICC model.
re.icc.survives.model <- glmer(
  as.formula("survives ~ + (1 | wiki) - 1"),
  data = newcomer.ds,
  family = binomial(link = logit)
)
saveRDS(re.icc.survives.model, "re.icc.survives.model.RDS")

## ICC = between-wiki variance / (between-wiki variance + residual variance).
## NOTE(review): the residual term is the sample variance of the model
## residuals rather than a fixed latent-scale constant -- confirm this is the
## intended definition.
varcorrmat <- as.data.table(VarCorr(re.icc.survives.model))
wiki.var <- varcorrmat[grp == 'wiki' & var1 == "(Intercept)", vcov]
group.var <- var(residuals(re.icc.survives.model))
icc <- wiki.var / (group.var + wiki.var)

remember(varcorrmat, 'icc.survives.varcormat')
remember(group.var, 'icc.survives.group.var')
remember(icc, 'icc.survives')

## newcomer.no.pooling.f <- as.formula("survives ~ is.reverted:wiki.name + is.messaged:wiki.name + is.bot.reverted:wiki.name + session.edits.log:wiki.name + wiki.name + quarter:wiki.name + wiki.name:wiki.age - 1")
## newcomer.no.pooling.mod <- glm(newcomer.no.pooling.f,gdata=newcomer.ds,family=binomial(link=logit))
## remember(extract(newcomer.no.pooling.mod),"newcomer.no.pooling.mod",silent=TRUE)

## if( !(exists("halfak.robustnes1.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
## halfak.robustness1.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki + quarter + wiki:wiki.age")
## print("fitting halfak robustness 1 model")
## newcomer.robustness.ds <- newcomer.ds[p.reverted <= 0.05]
## halfak.robustness1.mod <- glm(halfak.robustness1.formula,data=newcomer.robustness.ds,family=binomial(link=logit))
## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
## remember(extract(halfak.robustness1.mod),"halfak.robustness1.model")
## }
## else if(file.exists("halfak.robustness1.mod.RDS") & !exists("halfak.robustness1.mod")){
## newcomer.no.pooling.mod <- readRDS("halfak.robustness1.mod.RDS")
## }
## else if (exists("halfak.robustness1.mod")){
## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
## }
## remember(extract(halfak.robustness1.mod),"halfak.robustness1.mod")
## }

## if( !(exists("halfak.robustnes2.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
## halfak.robustness2.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits .log + wiki + quarter + wiki:wiki.age")
## print("fitting halfak robustness 2 model")
## newcomer.robustness.ds2 <- newcomer.ds[p.reverted <= 0.5]
## halfak.robustness2.mod <- glm(halfak.robustness2.formula,data=newcomer.robustness.ds2,family=binomial(link=logit))
## saveRDS(halfak.robustness1.mod,"halfak.robustness2.mod.RDS")
## remember(extract(halfak.robustness1.mod),"halfak.robustness2.model")
## }
## else if(file.exists("halfak.robustness2.mod.RDS") & !exists("halfak.robustness2.mod")){
## halfak.robustness2.mod <- readRDS("halfak.robustness2.mod.RDS")
## }
## else if (exists("halfak.robustness2.mod")){
## saveRDS(halfak.robustness2.mod,"halfak.robustness2.mod.RDS")
## }
## remember(extract(halfak.robustness2.mod),"halfak.robustness2.mod")
## }