# Library containing code for processing wikiq tsvs into datasets
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(urltools)
library(lubridate)
### is it more efficient to develop inside the loop or outside?
## with group by outside mclapply
## user system elapsed
## 3.743 8.112 6.219
## user system elapsed
## 609.715 592.603 638.172
## with group by inside mclapply
## user system elapsed
## 3.670 8.302 5.780
## user system elapsed
## 739.826 408.396 596.346
## conclusion: do as much outside mclapply as possible
## Load the wikiq edits of every wiki and derive per-editor session and
## newcomer variables.
##
## Args:
##   wiki.list: table with one row per wiki; each row index is handed to
##     load.wikiq.files() together with the table itself.
##   session.window.length: a gap between two consecutive edits of one editor
##     that is longer than this starts a new session.
##   newcomer.period: an edit is a newcomer edit while the editor's age (time
##     since the editor's first edit on that wiki) is below this.
##   newcomer.sunset: the editor's first edit must precede the wiki's last
##     observed edit by more than this for the edit to count as a newcomer edit.
##   n.early.period.sessions: edits whose 0-based session index is below this
##     are flagged `in.early.session`.
##
## Returns: one data.table of all edits with the derived columns added.
build.newcomer.table.step1 <- function(wiki.list,
                                       session.window.length = duration(1,units="hours"),
                                       newcomer.period = duration(2*30,units="days"),
                                       newcomer.sunset = duration(180,units="days"),
                                       n.early.period.sessions = 1){
    ## Gaps between consecutive time stamps, always expressed in seconds.
    ## diff() on date-times picks its units (secs, mins, hours, days) from the
    ## data of each call, i.e. separately for every group below; forcing one
    ## unit keeps the whole column comparable.
    gaps.in.secs <- function(times){
        gaps <- diff(times, lag = 1, differences = 1)
        if (inherits(gaps, "difftime")) {
            units(gaps) <- "secs"
        }
        gaps
    }

    d.list <- mclapply(seq_len(nrow(wiki.list)), load.wikiq.files,
                       wiki.list = wiki.list, mc.preschedule = FALSE)
    ## serial alternative for debugging:
    ## d.list <- lapply(seq_len(nrow(wiki.list)), load.wikiq.files, wiki.list = wiki.list)
    all.edits <- rbindlist(d.list)

    all.edits[,
              ":="(time.first.edit = min(date.time),
                   time.last.edit = max(date.time)),
              by = .(editor.id, wiki.name)]

    ## strip the double quotes left in the text fields
    all.edits[,
              ":="(editor = gsub("\"","",editor),
                   title = gsub("\"","",title),
                   reverteds = gsub("\"","",reverteds))]

    all.edits <- all.edits[editor != "Default"]
    all.edits[, month := floor_date(date.time, unit = "month")]

    ## sort so that the per-editor gaps below are computed in time order
    setkey(all.edits, wiki.name, editor.id, date.time)

    ## A session is a run of edits by one editor with no gap longer than
    ## session.window.length. The first edit gets a gap of 0 (measured from
    ## time.first.edit) and the last edit a gap of 0 to time.last.edit.
    all.edits[, ":="(time.since.last.edit = gaps.in.secs(c(first(time.first.edit), date.time)),
                     time.till.next.edit = gaps.in.secs(c(date.time, last(time.last.edit))),
                     editor.tenure = as.duration(max(date.time) - min(date.time))),
              by = .(editor.id, wiki.name)]

    all.edits[, ":="(new.session = time.since.last.edit > session.window.length), by = .(editor.id, wiki.name)]
    ## 0 for the editor's first session, 1 for the second, ...
    all.edits[, ":="(nth.session = cumsum(new.session)), by = .(editor.id, wiki.name)]
    all.edits[, ":="(in.early.session = nth.session < n.early.period.sessions)]

    all.edits[,
              ":="(is.reverted = any(reverted),
                   is.deleted = any(deleted),
                   p.reverted = mean(reverted & namespace == 0),
                   ## number of edits in the early session(s); NA flags are not counted
                   n.first.session = sum(in.early.session == TRUE, na.rm = TRUE)),
              by = .(editor.id, wiki.name)]

    all.edits[, ":="(age = as.duration(date.time - time.first.edit))]
    all.edits[, ":="(last.wiki.edit = max(date.time)), by = .(wiki.name)]
    all.edits[, ":="(is.newcomer = (age < newcomer.period) &
                         (as.duration(last.wiki.edit - time.first.edit) > as.duration(newcomer.sunset)) &
                         !anon)]
    return(all.edits)
}
## Mark each edit as made by a bot and/or an admin.
##
## `bots` and `admins` hold one row per role period (columns wiki, user,
## role.period.begin, role.period.end and is.bot / is.admin). An edit gets the
## flag when its date.time lies inside a period of the same wiki and user.
## Edits without a matching period are set to FALSE, and bot edits are removed
## from `is.newcomer`. All three tables are modified by reference.
add.userroles <- function(all.edits,bots,admins){
    ## give the role tables the column names used by all.edits
    bots[, c("wiki.name", "editor") := list(wiki, user), by = .(wiki, user)]
    admins[, c("wiki.name", "editor") := list(wiki, user), by = .(wiki, user)]

    ## range joins: the edit must fall inside the role period
    all.edits[bots,
              is.bot := i.is.bot,
              on = .(wiki.name, editor,
                     date.time >= role.period.begin,
                     date.time <= role.period.end)]
    all.edits[admins,
              is.admin := i.is.admin,
              on = .(wiki.name, editor,
                     date.time >= role.period.begin,
                     date.time <= role.period.end)]

    ## no matching role period means the editor held no such role
    all.edits[is.na(is.bot), is.bot := FALSE]
    all.edits[is.na(is.admin), is.admin := FALSE]

    all.edits[, is.newcomer := (is.newcomer & !is.bot)]
    return(all.edits)
}
## Attach revert, discussion and wiki-level timing information to every edit.
##
## Steps:
##   1. expand `reverteds` (comma-separated revision ids) so that every reverted
##      edit in namespace 0 or 4 learns who reverted it and when;
##   2. find article talk-page edits by the reverted editor after the revert
##      and within `discussion.window` of it;
##   3. find article talk-page edits by the reverter after the revert;
##   4. find edits by the reverter to the reverted editor's user talk page
##      within `discussion.window` of the revert;
##   5. add per-article and per-wiki running variables and a week index.
##
## Args:
##   all.edits: edit table as returned by add.userroles(); modified by reference.
##   discussion.window: how long after a revert a talk-page edit still counts.
##   week.length: width of the windows used for the `week` index.
##
## Returns: all.edits with the new columns; helper columns are dropped again.
identify.revert.messages <- function(all.edits, discussion.window = as.difftime(7,units="days"),week.length=as.difftime(7,units="days")){
    ## titles of the talk pages that belong to each edit: the editor's user talk
    ## page and, for namespace-0 edits, the talk page of the article
    all.edits[, user.talk := as.factor(paste0("User talk:", as.character(editor)))]
    all.edits[namespace == 0, talk := as.factor(paste0("Talk:", as.character(title)))]

    print(" identifying reverts")
    all.edits[!is.na(reverteds), reverted.edits := lapply(strsplit(reverteds, ","), strtoi)]
    ## plain integer counts (not a list column) so they can drive rep() below
    all.edits[!is.na(reverteds), N.reverteds := lengths(reverted.edits)]

    ns.edits <- all.edits[namespace == 0 | namespace == 4]

    ## one row per (reverting edit, reverted revision id)
    reverted.lookup <- ns.edits[!is.na(reverteds),
                                .(revid = unlist(reverted.edits),
                                  wiki.name = rep(wiki.name, N.reverteds),
                                  reverted.by = rep(editor, N.reverteds),
                                  reverted.by.bot = rep(is.bot, N.reverteds),
                                  reverted.by.admin = rep(is.admin, N.reverteds),
                                  revert.date.time = rep(date.time, N.reverteds),
                                  revert.id = rep(revid, N.reverteds))]

    reverted.edits <- ns.edits[reverted == TRUE]
    rm(ns.edits)

    ## every value comes from the lookup table, hence the explicit i. prefix
    reverted.edits[reverted.lookup,
                   ":="(reverted.by = i.reverted.by,
                        reverted.by.bot = i.reverted.by.bot,
                        reverted.by.admin = i.reverted.by.admin,
                        revert.date.time = i.revert.date.time,
                        revert.id = i.revert.id),
                   on = .(wiki.name, revid)]

    reverted.edits[, message.window.end := revert.date.time + discussion.window]

    ## merge back revert info to all.edits
    all.edits[reverted.edits,
              ":="(reverted.by = i.reverted.by,
                   reverted.by.bot = i.reverted.by.bot,
                   reverted.by.admin = i.reverted.by.admin,
                   revert.date.time = i.revert.date.time,
                   revert.id = i.revert.id,
                   message.window.end = i.message.window.end),
              on = .(wiki.name, revid)]
    print(" done")

    print(" identifying editor talk page edits")
    talk.page.edits <- all.edits[namespace == 1]
    talk.page.edits[, talk := title]

    ## we only need to keep the key identifier for each revert
    ## use editor + title instead of revid since editors may have more than
    ## one edit reverted by a given revert.id.
    ## key = wiki.name,editor,title,revert.id,
    setkeyv(reverted.edits, c("wiki.name", "editor", "title", "revert.id"))

    ## The editor discusses after being reverted: a talk-page edit by the
    ## reverted editor, on the talk page of the reverted article, made no earlier
    ## than the revert and before the end of the discussion window.
    ## (This used to be an equality join on the time stamps, which only matched
    ## talk edits saved at exactly the moment of the revert.)
    editor.talks <- reverted.edits[talk.page.edits,
                                   .(
                                       wiki.name,
                                       editor = x.editor,
                                       revert.id = x.revert.id,
                                       talk.id = i.revid,
                                       talk.date.time = i.date.time
                                   ),
                                   on = .(editor,
                                          wiki.name,
                                          talk,
                                          revert.date.time <= date.time,
                                          message.window.end >= date.time),
                                   nomatch = 0L]

    ## keep the earliest talk edit per revert
    editor.talks <- editor.talks[,
                                 .(
                                     editor.talks = TRUE,
                                     time.editor.talks = min(talk.date.time),
                                     editor.talks.revid = min(talk.id)
                                 ),
                                 by = .(wiki.name, editor, revert.id)]

    ## merge back reverted edits to all.edits
    all.edits[editor.talks,
              ":="(editor.talks = i.editor.talks,
                   time.editor.talks = i.time.editor.talks,
                   editor.talks.revid = i.editor.talks.revid),
              on = .(wiki.name, editor, revert.id)]

    ## tidy up
    rm(editor.talks, reverted.lookup)
    print(" done")

    print(" identifying reverter talk page edits")
    all.edits[, ":="(response.window.end = time.editor.talks + discussion.window)]
    all.edits[(reverted == TRUE & is.na(editor.talks)), editor.talks := FALSE]

    ns0.edits <- all.edits[namespace == 0]
    reverted.edits <- ns0.edits[reverted == TRUE]
    talk.page.edits <- all.edits[namespace == 1]
    ## match on the author of the talk-page edit being the reverter
    talk.page.edits[, ":="(talk = title, reverted.by = editor)]

    ## Talk-page edits by the reverter made no earlier than the revert.
    ## (Previously an equality join; together with the strict `>` filter below
    ## it could never return a row.)
    ## NOTE(review): no upper bound is applied here. response.window.end is
    ## computed above but unused -- confirm whether it, or message.window.end,
    ## was meant to close this window.
    reverter.talks <- reverted.edits[talk.page.edits,
                                     .(
                                         wiki.name = wiki.name,
                                         editor = x.editor,
                                         revert.id = x.revert.id,
                                         revert.date.time = x.revert.date.time,
                                         time.reverter.talks = i.date.time,
                                         reverter.talk.id = i.revid
                                     ),
                                     on = .(reverted.by,
                                            wiki.name,
                                            talk,
                                            revert.date.time <= date.time),
                                     nomatch = 0L]

    ## strictly after the revert; keep the earliest talk edit per revert
    reverter.talks <- reverter.talks[time.reverter.talks > revert.date.time,
                                     .(
                                         reverter.talks = TRUE,
                                         time.reverter.talks = min(time.reverter.talks),
                                         reverter.talk.id = min(reverter.talk.id)
                                     ),
                                     by = .(wiki.name, editor, revert.id)]

    ## merge back reverted.edits to all.edits
    all.edits[reverter.talks,
              ":="(reverter.talks = i.reverter.talks,
                   time.reverter.talks = i.time.reverter.talks,
                   reverter.talk.id = i.reverter.talk.id),
              on = .(wiki.name, editor, revert.id)]

    ## tidy up
    rm(reverter.talks, talk.page.edits)

    all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]

    ## if the editor didn't talk first, the time window is different:
    ## the reverter has to talk within discussion.window of the reverted edit
    all.edits[reverter.talks == TRUE,
              editor.talks.first := (time.editor.talks < time.reverter.talks)]
    all.edits[(reverter.talks == TRUE) & (editor.talks.first == FALSE),
              reverter.talks := time.reverter.talks < (date.time + discussion.window)]
    print(" done")

    print(" identifying User talk page edits")
    ## now do the same thing but for user talk pages
    ## did the reverter post on the editor's user talk page?
    ## key is wiki.name, title, reverted.by, revert.id
    ns0.edits <- all.edits[namespace == 0]
    user.talk.edits <- all.edits[namespace == 3]
    user.talk.edits[, ":="(reverted.by = editor, user.talk = title)]
    reverted.edits <- ns0.edits[reverted == TRUE]
    rm(ns0.edits)

    reverter.messages <- reverted.edits[user.talk.edits,
                                        .(wiki.name = x.wiki.name,
                                          title = x.title,
                                          revert.id = x.revert.id,
                                          editor = x.editor,
                                          reverted.by = i.reverted.by,
                                          time.reverter.messages = i.date.time,
                                          reverter.messages.id = i.revid),
                                        on = .(wiki.name,
                                               reverted.by,
                                               user.talk,
                                               revert.date.time <= date.time,
                                               message.window.end >= date.time),
                                        nomatch = 0L]

    reverter.messages <- reverter.messages[, .(reverter.messages = TRUE,
                                               time.reverter.messages = min(time.reverter.messages),
                                               reverter.message.id = min(reverter.messages.id)),
                                           by = .(wiki.name, editor, reverted.by, revert.id)]

    reverted.edits[reverter.messages,
                   ":="(reverter.messages = i.reverter.messages,
                        time.reverter.messages = i.time.reverter.messages,
                        reverter.message.id = i.reverter.message.id),
                   on = .(wiki.name, editor, revert.id)]

    reverted.edits[is.na(reverter.messages), reverter.messages := FALSE]

    all.edits[reverted.edits,
              ":="(reverter.messages = i.reverter.messages,
                   time.reverter.messages = i.time.reverter.messages,
                   reverter.message.id = i.reverter.message.id),
              on = .(wiki.name, editor, revert.id)]

    ## set some wiki-level variables
    print(" creating wiki windows")
    setorderv(all.edits, cols = c("wiki.name", "date.time", "articleid"), order = 1L)

    ## per article: change in text length against the previous revision (the
    ## first revision is compared with 0) and a flag for the creating edit
    all.edits[, ":="(chars.change = diff(c(0L, text.chars), lag = 1, differences = 1),
                     creates.article = (date.time == min(date.time))
                     ), by = .(wiki.name, articleid)]

    all.edits[, ":="(wiki.birth.date = min(date.time)), by = .(wiki.name)]

    ## running totals per wiki, in the chronological order set above
    all.edits[, ":="(total.wiki.length = cumsum(chars.change),
                     n.articles = cumsum(creates.article),
                     wiki.age = as.duration(date.time - wiki.birth.date),
                     year = year(date.time)
                     ), by = .(wiki.name)]

    all.edits[, ":="(wiki.age.months = floor(as.double(wiki.age, units = 'days')/30),
                     wiki.age.years = floor(as.double(wiki.age, units = 'years')))]

    ## generate breaks at multiples of week.length, starting from the day of the
    ## wiki's first edit
    date.range <- all.edits[, .(first.edit = min(date.time), last.edit = max(date.time)), by = .(wiki.name)]
    window.breaks <- date.range[, .(breaks = seq(trunc(first.edit, "days"),
                                                 trunc(last.edit, "days"),
                                                 by = week.length),
                                    break.next = seq(trunc(first.edit + week.length, "days"),
                                                     trunc(last.edit + week.length, "days"),
                                                     by = week.length)),
                                by = .(wiki.name)]

    window.breaks[, ":="(i.break = seq_along(breaks)), by = .(wiki.name)]

    ## `week` is the index of the window [breaks, break.next] containing the edit
    all.edits[window.breaks,
              ":="(week = i.break),
              on = .(wiki.name, date.time <= break.next, date.time >= breaks)]
    print(" done")

    ## tidy up: drop helper columns, skipping any that are not present
    helper.cols <- intersect(c("reverted.edits",
                               "N.reverteds",
                               "user",
                               "user.talk",
                               "talk",
                               "message.window.end",
                               "response.window.end"),
                             names(all.edits))
    if (length(helper.cols) > 0) {
        all.edits[, (helper.cols) := NULL]
    }
    print(" done")
    return(all.edits)
}
## Build one row per (wiki, editor) describing the editor's early-session
## activity in namespace 0, how they were treated (reverts, messages,
## discussion) and whether they survived.
##
## Args:
##   all.edits: edit table as returned by identify.revert.messages(); it is
##     re-keyed by date.time and gets helper columns added by reference.
##   newcomer.period: length of the newcomer period; `survives` requires an
##     edit at an age above it.
##   newcomer.sunset: `survives` requires that edit to be at an age below this.
##
## Returns: data.table of newcomers, without bots and admins, restricted to
##   wikis with wiki.type "wikia" and to editors whose first edit is more than
##   60 days before the last observed edit.
build.newcomers <- function(all.edits,
                            newcomer.period = duration(60,unit="days"),
                            newcomer.sunset= duration(30*6,unit="days")
                            ){
    ## chronological order, so the per-page diff() below runs forward in time
    setkeyv(all.edits,'date.time')
    all.edits[,":="(time.last.edit.to.wiki = max(date.time)), by=.(wiki.name)]

    ## Days until the same page is edited again. The gaps are converted to days
    ## inside each group: diff() chooses its units per call, so gaps of
    ## different pages would otherwise be stored in a mix of secs/mins/hours/days.
    all.edits[, time.till.page.edit := c(as.numeric(diff(date.time), units = "days"), NA_real_),
              by = .(wiki.name, articleid)]
    all.edits[, last.edit.to.page := is.na(time.till.page.edit)]
    ## for the last edit to a page use the time until the wiki's last edit instead
    all.edits[last.edit.to.page == TRUE,
              time.till.page.edit := as.numeric(difftime(time.last.edit.to.wiki, date.time, units = "days"))]
    all.edits[, time.till.page.edit := log1p(time.till.page.edit)]

    editor.variables <- all.edits[,
                                  .(survives = any( (age > newcomer.period) & (age < newcomer.sunset)),
                                    anon = first(anon),
                                    is.bot = any(is.bot),
                                    is.admin = any(is.admin)),
                                  by = .(wiki.name,editor)
                                  ]

    first.session.edits <- all.edits[in.early.session==TRUE]
    first.session.edits[,":="(end.newcomer.period = time.first.edit + newcomer.period)]

    print(" aggregating newcomer activity within wikis")
    newcomers <- first.session.edits[namespace == 0,
                                     .(
                                         is.reverted = any(reverted & reverted.by != editor),
                                         p.reverted = first(p.reverted),
                                         is.bot.reverted = any(reverted.by.bot),
                                         is.admin.reverted = any(reverted.by.admin),
                                         is.reverted.messaged = any(reverter.messages |
                                                                    reverter.talks,na.rm=TRUE),
                                         reverter.talks = any(reverter.talks, na.rm=TRUE),
                                         reverter.messages = any(reverter.messages, na.rm=TRUE),
                                         editor.talks = any(editor.talks,na.rm=TRUE),
                                         time.next.page.edit = min(time.till.next.edit, na.rm=TRUE),
                                         BRD.initiation = any(editor.talks &
                                                              (editor.talks.first |
                                                               !reverter.talks), na.rm = TRUE),
                                         BRD.reciprocation = any(editor.talks &
                                                                 editor.talks.first &
                                                                 reverter.talks, na.rm = TRUE),
                                         reverter.initates.BRD = any(reverter.talks & (!editor.talks.first |
                                                                                       is.na(editor.talks.first)),na.rm=TRUE),
                                         time.first.edit = first(time.first.edit),
                                         time.till.page.edit = min(time.till.page.edit),
                                         last.edit.to.page = all(last.edit.to.page),
                                         end.newcomer.period = first(end.newcomer.period),
                                         week = first(week),
                                         year = first(year(time.first.edit)),
                                         newcomer.edits = .N,
                                         session.edits = first(n.first.session),
                                         ns0.edits = sum(namespace == 0),
                                         ns1.edits = sum(namespace == 1),
                                         ns4.edits = sum(namespace == 4),
                                         newcomer.chars.change = sum(chars.change),
                                         newcomer.creates.article = any(creates.article),
                                         wiki.type = first(wiki.type),
                                         wiki.age = first(wiki.age)
                                     ),
                                     by = .(wiki.name, editor)
                                     ]

    ## editor-level flags come from the lookup table, hence the i. prefix
    newcomers[editor.variables,
              ":="(survives = i.survives, is.bot = i.is.bot, is.admin = i.is.admin),
              on=.(wiki.name,editor)]
    newcomers <- newcomers[!is.bot & !is.admin]
    print(" done")

    print(" identifying newcomer activity on other wikis")
    ## early-session edits by the same editor name on wikis of the same type
    ## where that editor's first edit is earlier than on this wiki
    newcomer.prior.wikis <- first.session.edits[newcomers,
                                                .(
                                                    editor = editor,
                                                    wiki.name = i.wiki.name,
                                                    other.wiki = x.wiki.name,
                                                    time.first.edit.this = i.time.first.edit,
                                                    time.first.edit.other = x.time.first.edit
                                                ),
                                                on=.(wiki.type,editor,time.first.edit < time.first.edit),
                                                nomatch=0L,
                                                allow.cartesian = TRUE
                                                ]
    ## using < time first edit should exclude edits to this wiki
    newcomer.prior.wikis <- newcomer.prior.wikis[,.(n.edits.other = .N),
                                                 by=.(editor,wiki.name,other.wiki)]
    newcomer.prior.wikis <- newcomer.prior.wikis[,
                                                 .(n.other.wikis = .N,
                                                   n.edits.other = sum(n.edits.other)),
                                                 by=.(wiki.name,editor)]
    newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
                                                 .(
                                                     wiki.name=wiki.name,
                                                     editor=editor,
                                                     n.other.wikis = n.other.wikis,
                                                     n.edits.other = n.edits.other,
                                                     has.edited.other.wikis = (n.other.wikis > 0) & (!is.na(n.other.wikis))),
                                                 on=.(wiki.name,editor),
                                                 nomatch=NA]
    newcomers <- newcomers[newcomer.prior.wikis,
                           ":="(n.other.wikis = ifelse(is.na(i.n.other.wikis),0,i.n.other.wikis),
                                n.edits.other = ifelse(is.na(i.n.edits.other),0,i.n.edits.other),
                                has.edited.other.wikis = (i.n.other.wikis > 0) & (!is.na(i.n.other.wikis))),
                           on=.(wiki.name, editor)
                           ]
    newcomers[,":="(has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis),FALSE,has.edited.other.wikis),
                    n.edits.other = ifelse(is.na(n.edits.other),0,n.edits.other),
                    n.other.wikis = ifelse(is.na(n.other.wikis),0,n.other.wikis)
                    )]
    print(" done")

    print(" identifying all messages")
    ## count edits to the newcomer's user talk page up to the end of the
    ## newcomer period
    user.talk.edits <- all.edits[namespace==3]
    user.talk.edits[,user.talk:=title]
    newcomers[,user.talk:= as.factor(paste0("User talk:",as.character(editor)))]
    newcomer.messages <- user.talk.edits[newcomers,
                                         .(
                                             editor = i.editor,
                                             n.messages = .N,
                                             end.newcomer.period = i.end.newcomer.period
                                         ),
                                         on=.(wiki.name,user.talk,date.time <= end.newcomer.period),
                                         by=.EACHI,
                                         nomatch=0L]
    newcomer.messages <- newcomer.messages[newcomers,
                                           .(wiki.name,
                                             editor,
                                             n.messages = x.n.messages,
                                             is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
                                           on=.(wiki.name,editor),
                                           nomatch = NA]
    newcomers <- newcomers[newcomer.messages,
                           ":="(n.messages = ifelse(is.na(i.n.messages),0L,i.n.messages),
                                is.messaged = ifelse(is.na(i.n.messages),FALSE,i.is.messaged)),
                           on=.(wiki.name,editor)]

    ## drop editors who joined too close to the end of the data
    ## NOTE(review): the 60-day cutoff is hard-coded here rather than taken
    ## from newcomer.period.
    last.edit <- max(all.edits$date.time)
    last.wikia.edit <- max(all.edits[wiki.type=="wikia",date.time])
    newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60,units="days")]
    newcomers <- newcomers[(wiki.type == "wikia") & (time.first.edit < (last.wikia.edit - as.difftime(60,units="days")))]
    print(" done")
    return(newcomers)
}
## Subset of all.edits made in namespace 4 by non-anonymous editors.
## `week.length` is accepted but not used.
build.namespace4.dataset <- function(all.edits, week.length = as.difftime(7,units="days")){
    all.edits[(anon == FALSE) & (namespace == 4)]
}
## Aggregate all.edits to one row per (wiki.name, week).
##
## Combines three summaries: overall activity (all namespaces), revert and
## newcomer rates (namespace 0) and project-namespace activity (namespace 4).
## `week.length` is accepted but not used.
build.wiki.level.variables <- function(all.edits, week.length = as.difftime(7,units="days")){
    ## overall activity
    wiki.data <- all.edits[,
                           list(n.editors = length(unique(editor)),
                                total.wiki.length = last(total.wiki.length)),
                           by = c("wiki.name", "week")]

    ## project namespace
    wiki.ns4.data <- all.edits[namespace == 4,
                               list(n.ns4.edits = .N,
                                    n.ns4.editors = length(unique(editor)),
                                    d.ns4.length = sum(chars.change),
                                    ns4.editor.age = mean(age)),
                               by = c("wiki.name", "week")]

    ## article namespace: revert, discussion and message rates
    wiki.ns0.data <- all.edits[namespace == 0,
    {
        n.reverted <- sum(reverted, na.rm = TRUE)
        n.newcomer.reverted <- sum(reverted & is.newcomer, na.rm = TRUE)
        list(revert.rate = mean(reverted, na.rm = TRUE),
             newcomer.revert.rate = n.newcomer.reverted / sum(is.newcomer, na.rm = TRUE),
             revert.disc.rate = sum(reverted & reverter.talks, na.rm = TRUE) / n.reverted,
             newcomer.revert.disc.rate = sum(reverted & reverter.talks & is.newcomer, na.rm = TRUE) / n.newcomer.reverted,
             revert.message.rate = sum(reverted & reverter.messages, na.rm = TRUE) / n.reverted,
             newcomer.revert.message.rate = sum(reverted & reverter.messages & is.newcomer, na.rm = TRUE) / n.newcomer.reverted,
             newcomer.edits.rate = mean(is.newcomer, na.rm = TRUE),
             bot.revert.rate = mean(reverted.by.bot, na.rm = TRUE),
             bot.revert.prop = sum(reverted.by.bot, na.rm = TRUE) / n.reverted,
             newcomer.bot.revert.rate = mean(reverted.by.bot & is.newcomer, na.rm = TRUE),
             newcomer.bot.revert.prop = sum(reverted.by.bot & is.newcomer, na.rm = TRUE) / n.newcomer.reverted,
             admin.revert.rate = mean(reverted.by.admin, na.rm = TRUE),
             admin.revert.prop = sum(reverted.by.admin, na.rm = TRUE) / n.reverted,
             year = year(first(date.time)),
             month = month(first(date.time)))
    },
    by = c("wiki.name", "week")]

    ## replace NAs (including 0/0) with 0; revert.rate and newcomer.revert.rate
    ## are deliberately left as they are
    zero.filled <- c("revert.disc.rate",
                     "newcomer.revert.disc.rate",
                     "revert.message.rate",
                     "newcomer.revert.message.rate",
                     "newcomer.edits.rate",
                     "bot.revert.rate",
                     "bot.revert.prop",
                     "newcomer.bot.revert.rate",
                     "newcomer.bot.revert.prop",
                     "admin.revert.rate",
                     "admin.revert.prop")
    for (rate.col in zero.filled) {
        missing.rows <- which(is.na(wiki.ns0.data[[rate.col]]))
        set(wiki.ns0.data, i = missing.rows, j = rate.col, value = 0)
    }

    ## bring it together
    wiki.data[wiki.ns0.data,
              c("revert.rate",
                "revert.disc.rate",
                "newcomer.revert.disc.rate",
                "revert.message.rate",
                "newcomer.revert.message.rate",
                "newcomer.edits.rate",
                "bot.revert.rate",
                "bot.revert.prop",
                "newcomer.bot.revert.rate",
                "newcomer.bot.revert.prop",
                "admin.revert.rate",
                "admin.revert.prop") :=
                  list(i.revert.rate,
                       i.revert.disc.rate,
                       i.newcomer.revert.disc.rate,
                       i.revert.message.rate,
                       i.newcomer.revert.message.rate,
                       i.newcomer.edits.rate,
                       i.bot.revert.rate,
                       i.bot.revert.prop,
                       i.newcomer.bot.revert.rate,
                       i.newcomer.bot.revert.prop,
                       i.admin.revert.rate,
                       i.admin.revert.prop),
              on = c("wiki.name", "week")]

    wiki.data[wiki.ns4.data,
              c("n.ns4.edits", "n.ns4.editors", "d.ns4.length", "ns4.editor.age") :=
                  list(i.n.ns4.edits, i.n.ns4.editors, i.d.ns4.length, i.ns4.editor.age),
              on = c("wiki.name", "week")]

    return(wiki.data)
}
## Make sure a global `all.edits` exists.
##
## Does nothing when `all.edits` can already be found. Otherwise the table is
## read from "all.edits.RDS" or, if that file is missing, rebuilt from the
## wikiq files (and cached unless `nosave` is set). Relies on the globals
## wiki.list, newcomer.period, bots, admins and nosave, and on remember().
load.all.edits <- function(){
    if(exists("all.edits")){
        return(invisible(NULL))
    }

    file.name <- "all.edits.RDS"
    if(file.exists(file.name)){
        ## cached result of an earlier run
        print("loading wikiq data with reverts and messages")
        all.edits <- readRDS(file.name)
        print("done")
    } else {
        print("loading wikiq data")
        all.edits <- build.newcomer.table.step1(wiki.list, newcomer.period = newcomer.period)
        print("done")

        print("adding user role data")
        all.edits <- add.userroles(all.edits, bots = bots, admins = admins)
        print("done")

        print("identifying reverts and messages")
        all.edits <- identify.revert.messages(all.edits, week.length = as.difftime(7, units = "days"))
        print("done")

        if(!nosave){
            print("saving work")
            saveRDS(all.edits, file.name)
            print("done")
        }
    }

    remember(min(all.edits$date.time), "earliest.data.point")
    remember(max(all.edits$date.time), "latest.data.point")

    ## publish the table as a global variable
    all.edits <<- all.edits
}
## ---- script section: build or load the cached datasets ----
## Every dataset below follows the same pattern: skip if the object already
## exists, otherwise read it from its .RDS cache, otherwise build it from
## all.edits and (unless the global `nosave` is set) write the cache.
## `nosave` and remember() are defined outside this file.

## analysis windows, shared by the builders below
newcomer.period = duration(2*30,unit="days")
newcomer.sunset = duration(30*6,unit="days")
week.length=duration(7,unit="days")
## NOTE(review): remember() presumably records the value for later reporting -- confirm
remember(newcomer.period)
remember(newcomer.sunset)
remember(week.length)
## newcomers: one row per (wiki, editor), see build.newcomers()
if(!exists("newcomers")){
file.name2 <- "newcomers.RDS"
if(file.exists(file.name2)){
newcomers <- readRDS(file.name2)
} else{
print("building newcomers table")
load.all.edits()
newcomers <- build.newcomers(all.edits,
newcomer.sunset = newcomer.sunset,
newcomer.period=newcomer.period)
print("done")
## "saving work" is printed even when nosave suppresses the write
print("saving work")
if(!nosave){
saveRDS(newcomers,file.name2)
}
}
}
## ns4.reg.edits: namespace-4 edits by non-anonymous editors
if(!exists("ns4.reg.edits")){
file.name <- "ns4.reg.edits.RDS"
if(file.exists(file.name)){
ns4.reg.edits <- readRDS(file.name)
} else{
print("building ns4 edits table")
## create table of namespace 4 edits from all edits
load.all.edits()
ns4.reg.edits <- build.namespace4.dataset(all.edits)
print("done")
print("saving work")
if(!nosave){
saveRDS(ns4.reg.edits,file.name)
}
}
}
## wiki.data: one row per (wiki, week), see build.wiki.level.variables()
if(!exists("wiki.data")){
file.name3 <- "wikiweeks.RDS"
if(!file.exists(file.name3)){
print("building wiki level variable")
load.all.edits()
wiki.data <- build.wiki.level.variables(all.edits, week.length=week.length)
print("done")
print("saving work")
if(!nosave){
saveRDS(wiki.data,file.name3)
}
print("done")
}
else{
wiki.data <- readRDS(file.name3)
}
}
## disabled: drop wikis with fewer than 30 newcomers
#wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
#remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
#newcomers <- newcomers[!(wiki.name %in% wikis.to.remove)]
#all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]
## wiki.stats: one row per wiki with totals, median tenures and newcomer retention
## NOTE: unlike the caches above, this one is written regardless of `nosave`
if(!exists("wiki.stats")){
file.name <- "wiki.stats.RDS"
if(!file.exists(file.name)){
load.all.edits()
## one tenure value per editor, so the median below is over editors, not edits
editor.tenures <- all.edits[,.(tenure=first(editor.tenure)),by=.(wiki.name,editor)]
wiki.stats <- all.edits[,.(total.editors = length(unique(editor)),
total.edits = .N,
total.reverts = sum(reverted),
total.bot.reverts = sum(reverted.by.bot,na.rm=TRUE),
total.ns4.edits = nrow(.SD[namespace==4]),
## median over edits (each edit carries its editor's tenure)
med.edit.tenure = median(editor.tenure)
),by=.(wiki.name)]
med.editor.tenure <- editor.tenures[,.(med.editor.tenure=median(tenure)),by=.(wiki.name)]
wiki.stats[med.editor.tenure,med.tenure := med.editor.tenure,on="wiki.name"]
newcomer.stats <- newcomers[,.(retention.rate = mean(survives),
reverted.newcomers = sum(is.reverted)
),by=.(wiki.name)]
wiki.stats <- wiki.stats[newcomer.stats,':='(retention.rate = retention.rate, reverted.newcomers = reverted.newcomers), on="wiki.name"]
remember(wiki.stats,silent=TRUE)
saveRDS(wiki.stats,file.name)
} else {
wiki.stats <- readRDS("wiki.stats.RDS")
}
}
## long-format copy of wiki.stats with a (row, col) grid position per variable
row1 <- c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits")
row2 <- c("med.editor.tenure","retention.rate")
## only the row1 variables are melted, so the row2 assignment below matches no rows
m.wiki.stats <- melt(wiki.stats,id='wiki.name',measure.vars = c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits"))
m.wiki.stats[variable %in% row1, ":="(row = 1,col=which(row1 == variable,useNames=F)),by=variable]
m.wiki.stats[variable %in% row2, ":="(row = 2,col=which(row2 == variable,useNames=F)),by=variable]
## drop wikis with zero bot reverts from the bot-revert variable ...
m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
## ... and replace zeros of the other variables by 1
## NOTE(review): presumably so the values can be shown on a log scale -- confirm
m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]
## Translate wiki.stats variable names into human-readable labels.
##
## Args:
##   varname: character vector or factor of variable names.
##
## Returns: a character vector of labels, named by the input names. Unknown
##   names give NA and empty input gives an empty character vector, so the
##   result is always a character vector (the previous sapply/switch version
##   returned a list in both cases).
friendly.var <- function(varname){
    labels <- c(total.editors = 'Editors',
                total.reverts = 'Reverts',
                total.bot.reverts = 'Bot reverts',
                total.ns4.edits = 'Edits to the project namespace')
    keys <- as.character(varname)
    out <- unname(labels[keys])
    names(out) <- keys
    out
}
## Translate wiki.stats variable names into their numeric position (1-4).
##
## Args:
##   varname: character vector or factor of variable names.
##
## Returns: a numeric vector of ids, named by the input names. Unknown names
##   give NA and empty input gives an empty numeric vector, so the result is
##   always numeric (the previous sapply/switch version returned a list in
##   both cases).
var.id <- function(varname){
    ids <- c(total.editors = 1,
             total.reverts = 2,
             total.bot.reverts = 3,
             total.ns4.edits = 4)
    keys <- as.character(varname)
    out <- unname(ids[keys])
    names(out) <- keys
    out
}
## ---- script section: plotting inputs and newcomer / wiki-week joins ----
med.line.width <- 1
## replace variable names by display labels and fix their order
m.wiki.stats[,variable := friendly.var(variable)]
m.wiki.stats <- m.wiki.stats[,variable:=factor(variable,levels=c('Editors',"Reverts","Bot reverts","Edits to the project namespace"))]
## median value per variable
spoke.data <- m.wiki.stats[,.(y = median(value)),by=variable]
remember(m.wiki.stats)
remember(spoke.data)
remember(nrow(wiki.stats),"n.wikia.wikis")
## join wiki-level variables with newcomer variables to get ready to model newcomer retention.
## Update join on (wiki.name, week): the i.-prefixed values come from wiki.data;
## the wiki.age.* columns and `quarter` are derived from columns not prefixed
## with i. (wiki.age, time.first.edit).
newcomers <- newcomers[wiki.data,
":="(
wiki.name=i.wiki.name,
week = i.week,
n.editors = i.n.editors,
total.wiki.length = i.total.wiki.length,
revert.rate = i.revert.rate,
revert.disc.rate = i.revert.disc.rate,
newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
revert.message.rate = i.revert.message.rate,
newcomer.revert.message.rate = i.newcomer.revert.message.rate,
newcomer.edits.rate = i.newcomer.edits.rate,
bot.revert.rate = i.bot.revert.rate,
bot.revert.prop = i.bot.revert.prop,
newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
admin.revert.rate = i.admin.revert.rate,
admin.revert.prop = i.admin.revert.prop,
n.ns4.edits = i.n.ns4.edits,
n.ns4.editors = i.n.ns4.editors,
d.ns4.length = i.d.ns4.length,
ns4.editor.age = i.ns4.editor.age,
wiki.age.weeks = as.double(wiki.age,units='days')/7,
wiki.age.months = floor(as.double(wiki.age,units='days')/30),
wiki.age.half.years = floor(as.double(wiki.age,units='years')*2),
wiki.age.years = floor(as.double(wiki.age,units='years')),
quarter = factor(floor_date(time.first.edit,unit="3 months"))
),
on=.(wiki.name,week)
]
## share of surviving newcomers and number of newcomers per (wiki, week);
## wiki.name and week are listed in j as well as in by
survival.data <- newcomers[,.(wiki.name,
week,
survival.rate = mean(survives),
n.newcomers = .N),
by = .(wiki.name, week)]
## add both to the wiki-week table
wiki.data <- wiki.data[survival.data,
":="(
survival.rate = survival.rate,
n.newcomers = n.newcomers),
on = .(wiki.name,week)]
## active.editors: number of edits per (wiki, editor, wiki age in months),
## cached in "active.editors.RDS"; written regardless of `nosave`
file.name <- "active.editors.RDS"
if(!file.exists(file.name)){
load.all.edits()
active.editors <- all.edits[,
.(N.edits=.N,
wiki.age.years=first(wiki.age.years)),
by=.(wiki.name,
editor,
wiki.age.months)]
saveRDS(active.editors, file.name)
} else {
active.editors <- readRDS(file.name)
}