# Library containing code for processing wikiq tsvs into datasets
# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

library(urltools)
library(lubridate)

## NOTE(review): this file also relies on data.table (`:=`, rbindlist, setkey,
## melt, ...), on mclapply, and on names it does not define (load.wikiq.files,
## remember, wiki.list, bots, admins, nosave). Presumably the script that
## sources this file provides them -- confirm.

### is it more efficient to develop inside the loop or outside?
## with group by outside mclapply
##    user  system elapsed
##   3.743   8.112   6.219
##    user  system elapsed
## 609.715 592.603 638.172

## with group by inside mclapply
##    user  system elapsed
##   3.670   8.302   5.780
##    user  system elapsed
## 739.826 408.396 596.346

## conclusion: do as much outside mclapply as possible

## Load the wikiq output for every wiki and add editor-level variables.
##
## wiki.list: table with one row per wiki; row indices are handed to
##   load.wikiq.files together with the table itself.
## session.window.length: a gap between consecutive edits by the same editor
##   longer than this starts a new session.
## newcomer.period, newcomer.sunset: an edit is a newcomer edit when the
##   editor's age is below newcomer.period, the wiki's last edit is more than
##   newcomer.sunset after the editor's first edit, and the editor is not
##   anonymous.
## n.early.period.sessions: number of initial sessions flagged as "early"
##   (sessions are counted from 0, so the default 1 flags the first session).
##
## Returns a data.table of edits keyed by wiki.name, editor.id, date.time.
build.newcomer.table.step1 <- function(wiki.list,
                                       session.window.length = duration(1, units = "hours"),
                                       newcomer.period = duration(2 * 30, units = "days"),
                                       newcomer.sunset = duration(180, units = "days"),
                                       n.early.period.sessions = 1) {

  ## seq_len() rather than 1:nrow() so an empty wiki.list loads nothing
  d.list <- mclapply(seq_len(nrow(wiki.list)), load.wikiq.files,
                     wiki.list = wiki.list, mc.preschedule = FALSE)
  # d.list <- lapply(1:nrow(wiki.list),wiki.list=wiki.list,load.wikiq.files)

  all.edits <- rbindlist(d.list)

  all.edits[, ":="(time.first.edit = min(date.time),
                   time.last.edit = max(date.time)),
            by = .(editor.id, wiki.name)]

  ## strip the quoting characters from the text fields
  all.edits[, ":="(editor = gsub("\"", "", editor),
                   title = gsub("\"", "", title),
                   reverteds = gsub("\"", "", reverteds))]

  all.edits <- all.edits[editor != "Default"]

  all.edits[, month := floor_date(date.time, unit = "month")]

  setkey(all.edits, wiki.name, editor.id, date.time)

  ## fix the definition of session to edits that have less than 1 hour together.
  ## Padding with time.first.edit / time.last.edit makes the first "since last"
  ## gap and the last "till next" gap of every editor equal to zero.
  all.edits[, ":="(
    time.since.last.edit = diff(c(first(time.first.edit), date.time),
                                lag = 1, differences = 1),
    ## lag/differences belong to diff(); they used to be passed to `:=`, which
    ## created spurious columns named "lag" and "differences"
    time.till.next.edit = diff(c(date.time, last(time.last.edit)),
                               lag = 1, differences = 1),
    editor.tenure = as.duration(max(date.time) - min(date.time))
  ), by = .(editor.id, wiki.name)]

  all.edits[, ":="(new.session = time.since.last.edit > session.window.length),
            by = .(editor.id, wiki.name)]

  all.edits[, ":="(nth.session = cumsum(new.session)),
            by = .(editor.id, wiki.name)]

  all.edits[, ":="(in.early.session = nth.session < n.early.period.sessions)]

  all.edits[, ":="(
    is.reverted = any(reverted),
    is.deleted = any(deleted),
    p.reverted = mean(reverted & namespace == 0),
    ## count of early-session edits; sum() avoids subsetting .SD per group
    n.first.session = sum(in.early.session, na.rm = TRUE)
  ), by = .(editor.id, wiki.name)]

  all.edits[, ":="(age = as.duration(date.time - time.first.edit))]

  all.edits[, ":="(last.wiki.edit = max(date.time)), by = .(wiki.name)]

  all.edits[, ":="(is.newcomer = (age < newcomer.period) &
                     (as.duration(last.wiki.edit - time.first.edit) >
                        as.duration(newcomer.sunset)) &
                     !anon)]

  ## did rejecting editors leave a comment on the talk page?
  return(all.edits)
}

## Flag edits made while the editor held a bot or admin role.
##
## bots, admins: data.tables with columns wiki, user, role.period.begin,
##   role.period.end and is.bot / is.admin respectively. Both are modified by
##   reference (wiki.name and editor columns are added).
##
## Returns all.edits with is.bot and is.admin filled in (FALSE when no role
## period matches) and is.newcomer cleared for bot edits.
add.userroles <- function(all.edits, bots, admins) {
  ## plain column copies; no grouping is needed for this
  bots[, ":="(wiki.name = wiki, editor = user)]
  admins[, ":="(wiki.name = wiki, editor = user)]

  all.edits[bots,
            ":="(is.bot = i.is.bot),
            on = .(wiki.name, editor,
                   date.time >= role.period.begin,
                   date.time <= role.period.end)]

  all.edits[admins,
            ":="(is.admin = i.is.admin),
            on = .(wiki.name, editor,
                   date.time >= role.period.begin,
                   date.time <= role.period.end)]

  all.edits[, ":="(is.bot = ifelse(is.na(is.bot), FALSE, is.bot),
                   is.admin = ifelse(is.na(is.admin), FALSE, is.admin))]

  all.edits[, ":="(is.newcomer = (is.newcomer & !is.bot))]
  return(all.edits)
}

## Attach revert information to reverted edits, detect follow-up discussion on
## article talk pages (namespace 1) and user talk pages (namespace 3), and add
## wiki-level running totals and week indices.
##
## all.edits: output of add.userroles(); modified by reference and returned.
## discussion.window: how long after a revert a talk / user talk edit still
##   counts as a response to it.
## week.length: width of the windows used for the `week` index.
identify.revert.messages <- function(all.edits,
                                     discussion.window = as.difftime(7, units = "days"),
                                     week.length = as.difftime(7, units = "days")) {

  ## page titles used below to join article edits with talk page edits
  all.edits[, user.talk := as.factor(paste0("User talk:", as.character(editor)))]
  all.edits[namespace == 0, talk := as.factor(paste0("Talk:", as.character(title)))]

  print(" identifying reverts")
  all.edits[!is.na(reverteds),
            reverted.edits := lapply(strsplit(reverteds, ","), strtoi)]
  ## integer count per row (not a list column) so it can be used as rep(times=)
  all.edits[!is.na(reverteds), N.reverteds := lengths(reverted.edits)]

  ns.edits <- all.edits[namespace == 0 | namespace == 4]

  ## one row per (reverting edit, reverted revid) pair
  reverted.lookup <- ns.edits[!is.na(reverteds),
                              .(revid = unlist(reverted.edits),
                                wiki.name = rep(wiki.name, N.reverteds),
                                reverted.by = rep(editor, N.reverteds),
                                reverted.by.bot = rep(is.bot, N.reverteds),
                                reverted.by.admin = rep(is.admin, N.reverteds),
                                revert.date.time = rep(date.time, N.reverteds),
                                revert.id = rep(revid, N.reverteds))]

  reverted.edits <- ns.edits[reverted == TRUE]

  reverted.edits[reverted.lookup,
                 ":="(reverted.by = i.reverted.by,
                      reverted.by.bot = i.reverted.by.bot,
                      reverted.by.admin = i.reverted.by.admin,
                      revert.date.time = i.revert.date.time,
                      revert.id = revert.id),
                 on = .(wiki.name, revid)]

  reverted.edits[, message.window.end := revert.date.time + discussion.window]

  ## merge back revert info to all.edits
  all.edits[reverted.edits,
            ":="(reverted.by = i.reverted.by,
                 reverted.by.bot = i.reverted.by.bot,
                 reverted.by.admin = i.reverted.by.admin,
                 revert.date.time = i.revert.date.time,
                 revert.id = revert.id,
                 message.window.end = message.window.end),
            on = .(wiki.name, revid)]
  print(" done")

  print(" identifying editor talk page edits")
  ns0.edits <- all.edits[namespace == 0]

  ## we want talkers who talk before the end of the window
  talk.page.edits <- all.edits[namespace == 1]
  talk.page.edits[, talk := title]

  ## we only need to keep the key identifier for each revert
  ## use editor + title instead of revid since editors may have more than
  ## one edit reverted by a given revert.id.
  ## key = wiki.name,editor,title,revert.id,
  setkeyv(reverted.edits, c("wiki.name", "editor", "title", "revert.id"))

  ## condition where editor discusses after being reverted.
  ## This used to be an equi-join on revert.date.time == date.time, which only
  ## matched talk edits carrying exactly the revert's timestamp. It now matches
  ## talk edits made from the revert until the end of the discussion window.
  editor.talks <- reverted.edits[talk.page.edits,
                                 .(wiki.name,
                                   editor = x.editor,
                                   revert.id = x.revert.id,
                                   talk.id = i.revid,
                                   talk.date.time = i.date.time),
                                 on = .(editor, wiki.name, talk,
                                        revert.date.time <= date.time,
                                        message.window.end >= date.time),
                                 nomatch = 0L,
                                 allow.cartesian = TRUE]

  editor.talks <- editor.talks[,
                               .(editor.talks = TRUE,
                                 time.editor.talks = min(talk.date.time),
                                 editor.talks.revid = min(talk.id)),
                               by = .(wiki.name, editor, revert.id)]

  ## merge back reverted edits to all.edits
  all.edits[editor.talks,
            ":="(editor.talks = editor.talks,
                 time.editor.talks = time.editor.talks,
                 editor.talks.revid = editor.talks.revid),
            on = .(wiki.name, editor, revert.id)]

  ## tidy up
  rm(editor.talks, reverted.lookup)
  print(" done")

  print(" identifying reverter talk page edits")
  ## NOTE(review): response.window.end is computed here and dropped at the end
  ## of the function without being used -- confirm whether the reverter window
  ## below was meant to use it when the editor talked first.
  all.edits[, ":="(response.window.end = time.editor.talks + discussion.window)]
  all.edits[(reverted == TRUE & is.na(editor.talks)), editor.talks := FALSE]

  ns0.edits <- all.edits[namespace == 0]
  reverted.edits <- ns0.edits[reverted == TRUE]
  talk.page.edits <- all.edits[namespace == 1]
  talk.page.edits[, ":="(talk = title, reverted.by = editor)]

  # the key is still wiki.name, editor, revert.id
  ## Non-equi join for the same reason as above: with an equality join the
  ## `time.reverter.talks > revert.date.time` filter below could never hold,
  ## leaving reverter.talks FALSE for every reverted edit.
  reverter.talks <- reverted.edits[talk.page.edits,
                                   .(wiki.name = wiki.name,
                                     editor = x.editor,
                                     revert.id = x.revert.id,
                                     revert.date.time = x.revert.date.time,
                                     time.reverter.talks = i.date.time,
                                     reverter.talk.id = i.revid),
                                   on = .(reverted.by, wiki.name, talk,
                                          revert.date.time <= date.time),
                                   nomatch = 0L,
                                   allow.cartesian = TRUE]

  reverter.talks <- reverter.talks[time.reverter.talks > revert.date.time,
                                   .(reverter.talks = TRUE,
                                     time.reverter.talks = min(time.reverter.talks),
                                     reverter.talk.id = min(reverter.talk.id)),
                                   by = .(wiki.name, editor, revert.id)]

  ## merge back reverted.edits to all.edits
  all.edits[reverter.talks,
            ":="(reverter.talks = reverter.talks,
                 time.reverter.talks = time.reverter.talks,
                 reverter.talk.id = reverter.talk.id),
            on = .(wiki.name, editor, revert.id)]

  ## tidy up
  rm(reverter.talks, talk.page.edits)

  all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]

  # if the editor didn't talk first, the time window is different
  all.edits[reverter.talks == TRUE,
            editor.talks.first := (time.editor.talks < time.reverter.talks)]

  ## NOTE(review): editor.talks.first is NA when the editor never talked, so
  ## those rows are not restricted to the window here -- confirm intent.
  all.edits[(reverter.talks == TRUE) & (editor.talks.first == FALSE),
            reverter.talks := time.reverter.talks < (date.time + discussion.window)]
  print(" done")

  print(" identifying User talk page edits")
  ## now do the same thing but for user talk pages
  ## did the reverter post on the editor's user talk page?
  ## key is wiki.name, title, reverted.by, revert.id
  ns0.edits <- all.edits[namespace == 0]
  user.talk.edits <- all.edits[namespace == 3]
  user.talk.edits[, ":="(reverted.by = editor, user.talk = title)]

  reverted.edits <- ns0.edits[reverted == TRUE]

  reverter.messages <- reverted.edits[user.talk.edits,
                                      .(wiki.name = x.wiki.name,
                                        title = x.title,
                                        revert.id = x.revert.id,
                                        editor = x.editor,
                                        reverted.by = i.reverted.by,
                                        time.reverter.messages = i.date.time,
                                        reverter.messages.id = i.revid),
                                      on = .(wiki.name, reverted.by, user.talk,
                                             revert.date.time <= date.time,
                                             message.window.end >= date.time),
                                      nomatch = 0L]

  reverter.messages <- reverter.messages[,
                                         .(reverter.messages = TRUE,
                                           time.reverter.messages = min(time.reverter.messages),
                                           reverter.message.id = min(reverter.messages.id)),
                                         by = .(wiki.name, editor, reverted.by, revert.id)]

  reverted.edits[reverter.messages,
                 ":="(reverter.messages = reverter.messages,
                      time.reverter.messages = time.reverter.messages,
                      reverter.message.id = reverter.message.id),
                 on = .(wiki.name, editor, revert.id)]

  reverted.edits[is.na(reverter.messages), reverter.messages := FALSE]

  all.edits[reverted.edits,
            ":="(reverter.messages = reverter.messages,
                 time.reverter.messages = time.reverter.messages,
                 reverter.message.id = reverter.message.id),
            on = .(wiki.name, editor, revert.id)]

  ## set some wiki-level variables
  print(" creating wiki windows")

  ## chronological order within each wiki; the cumulative sums below depend on
  ## it (grouped `:=` does not reorder rows, so sorting once is enough)
  setorderv(all.edits, cols = c("wiki.name", "date.time", "articleid"), order = 1L)

  all.edits[, ":="(chars.change = diff(c(0L, text.chars), lag = 1, differences = 1),
                   creates.article = (date.time == min(date.time))),
            by = .(wiki.name, articleid)]

  # Some wikis got created by Wikia - invalidating wiki age that doesn't remove this editor
  all.edits[, ":="(wiki.birth.date = min(date.time)), by = .(wiki.name)]

  all.edits[, ":="(total.wiki.length = cumsum(chars.change),
                   n.articles = cumsum(creates.article),
                   wiki.age = as.duration(date.time - wiki.birth.date),
                   year = year(date.time)),
            by = .(wiki.name)]

  all.edits[, ":="(wiki.age.months = floor(as.double(wiki.age, units = "days") / 30),
                   wiki.age.years = floor(as.double(wiki.age, units = "years")))]

  ## generate breaks at precisely 1 week +/- the first edit.
  date.range <- all.edits[, .(first.edit = min(date.time),
                              last.edit = max(date.time)),
                          by = .(wiki.name)]

  window.breaks <- date.range[,
                              .(breaks = seq(trunc(first.edit, "days"),
                                             trunc(last.edit, "days"),
                                             by = week.length),
                                break.next = seq(trunc(first.edit + week.length, "days"),
                                                 trunc(last.edit + week.length, "days"),
                                                 by = week.length)),
                              by = .(wiki.name)]

  window.breaks[, ":="(i.break = seq_along(breaks)), by = .(wiki.name)]

  all.edits[window.breaks,
            ":="(week = i.break),
            on = .(wiki.name, date.time <= break.next, date.time >= breaks)]
  print(" done")

  ## tidy up
  all.edits[, ":="(reverted.edits = NULL,
                   N.reverteds = NULL,
                   user = NULL,
                   user.talk = NULL,
                   talk = NULL,
                   message.window.end = NULL,
                   response.window.end = NULL)]
  print(" done")

  rm(reverted.edits, reverter.messages, user.talk.edits, ns0.edits)
  return(all.edits)
}

## Aggregate early-session namespace-0 edits into one row per (wiki, editor).
##
## all.edits: output of identify.revert.messages(); modified by reference
##   (re-keyed on date.time, columns added).
## newcomer.period, newcomer.sunset: an editor "survives" when any of their
##   edits has an age strictly between the two.
##
## Returns a data.table of newcomers that excludes bots and admins.
build.newcomers <- function(all.edits,
                            newcomer.period = duration(60, units = "days"),
                            newcomer.sunset = duration(30 * 6, units = "days")) {
  setkeyv(all.edits, "date.time")

  all.edits[, ":="(time.last.edit.to.wiki = max(date.time)), by = .(wiki.name)]

  ## Time until the next edit to the same page, in days. diff() on date-times
  ## picks its units per call, so taking it per group without fixing the units
  ## mixed seconds, minutes, hours and days in a single column.
  all.edits[, time.till.page.edit := c(as.numeric(diff(date.time), units = "days"),
                                       NA_real_),
            by = .(wiki.name, articleid)]

  all.edits[, last.edit.to.page := is.na(time.till.page.edit)]

  ## for the last edit to a page, use the time until the wiki's last edit
  all.edits[last.edit.to.page == TRUE,
            time.till.page.edit := as.numeric(difftime(time.last.edit.to.wiki,
                                                       date.time,
                                                       units = "days"))]

  all.edits[, time.till.page.edit := log1p(time.till.page.edit)]

  editor.variables <- all.edits[,
                                .(survives = any((age > newcomer.period) &
                                                   (age < newcomer.sunset)),
                                  anon = first(anon),
                                  is.bot = any(is.bot),
                                  is.admin = any(is.admin)),
                                by = .(wiki.name, editor)]

  first.session.edits <- all.edits[in.early.session == TRUE]
  first.session.edits[, ":="(end.newcomer.period = time.first.edit + newcomer.period)]

  print(" aggregating newcomer activity within wikis")
  ## NOTE(review): rows are filtered to namespace == 0, so ns1.edits and
  ## ns4.edits below are always 0 -- confirm whether that is intended.
  newcomers <- first.session.edits[namespace == 0,
    .(is.reverted = any(reverted & reverted.by != editor),
      p.reverted = first(p.reverted),
      is.bot.reverted = any(reverted.by.bot),
      is.admin.reverted = any(reverted.by.admin),
      is.reverted.messaged = any(reverter.messages | reverter.talks, na.rm = TRUE),
      reverter.talks = any(reverter.talks, na.rm = TRUE),
      reverter.messages = any(reverter.messages, na.rm = TRUE),
      editor.talks = any(editor.talks, na.rm = TRUE),
      time.next.page.edit = min(time.till.next.edit, na.rm = TRUE),
      BRD.initiation = any(editor.talks & (editor.talks.first | !reverter.talks),
                           na.rm = TRUE),
      BRD.reciprocation = any(editor.talks & editor.talks.first & reverter.talks,
                              na.rm = TRUE),
      reverter.initates.BRD = any(reverter.talks &
                                    (!editor.talks.first | is.na(editor.talks.first)),
                                  na.rm = TRUE),
      time.first.edit = first(time.first.edit),
      time.till.page.edit = min(time.till.page.edit),
      last.edit.to.page = all(last.edit.to.page),
      end.newcomer.period = first(end.newcomer.period),
      week = first(week),
      year = first(year(time.first.edit)),
      newcomer.edits = .N,
      session.edits = first(n.first.session),
      ns0.edits = sum(namespace == 0),
      ns1.edits = sum(namespace == 1),
      ns4.edits = sum(namespace == 4),
      newcomer.chars.change = sum(chars.change),
      newcomer.creates.article = any(creates.article),
      wiki.type = first(wiki.type),
      wiki.age = first(wiki.age)),
    by = .(wiki.name, editor)]

  newcomers[editor.variables,
            ":="(survives = survives, is.bot = is.bot, is.admin = is.admin),
            on = .(wiki.name, editor)]

  newcomers <- newcomers[!is.bot & !is.admin]
  print(" done")

  print(" identifying newcomer activity on other wikis")
  newcomer.prior.wikis <- first.session.edits[newcomers,
    .(editor = editor,
      wiki.name = i.wiki.name,
      other.wiki = x.wiki.name,
      time.first.edit.this = i.time.first.edit,
      time.first.edit.other = x.time.first.edit),
    on = .(wiki.type, editor, time.first.edit < time.first.edit),
    nomatch = 0L,
    allow.cartesian = TRUE]
  # using < time first edit should exclude edits to this wiki

  newcomer.prior.wikis <- newcomer.prior.wikis[,
                                               .(n.edits.other = .N),
                                               by = .(editor, wiki.name, other.wiki)]

  newcomer.prior.wikis <- newcomer.prior.wikis[,
                                               .(n.other.wikis = .N,
                                                 n.edits.other = sum(n.edits.other)),
                                               by = .(wiki.name, editor)]

  newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
    .(wiki.name = wiki.name,
      editor = editor,
      n.other.wikis = n.other.wikis,
      n.edits.other = n.edits.other,
      has.edited.other.wikis = (n.other.wikis > 0) & (!is.na(n.other.wikis))),
    on = .(wiki.name, editor),
    nomatch = NA]

  newcomers <- newcomers[newcomer.prior.wikis,
    ":="(n.other.wikis = ifelse(is.na(i.n.other.wikis), 0, i.n.other.wikis),
         n.edits.other = ifelse(is.na(i.n.edits.other), 0, i.n.edits.other),
         has.edited.other.wikis = (i.n.other.wikis > 0) & (!is.na(i.n.other.wikis))),
    on = .(wiki.name, editor)]

  newcomers[, ":="(
    has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis), FALSE,
                                    has.edited.other.wikis),
    n.edits.other = ifelse(is.na(n.edits.other), 0, n.edits.other),
    n.other.wikis = ifelse(is.na(n.other.wikis), 0, n.other.wikis))]
  print(" done")

  print(" identifying all messages")
  user.talk.edits <- all.edits[namespace == 3]
  user.talk.edits[, user.talk := title]

  newcomers[, user.talk := as.factor(paste0("User talk:", as.character(editor)))]

  ## user talk page edits made up to the end of each newcomer's period
  newcomer.messages <- user.talk.edits[newcomers,
    .(editor = i.editor,
      n.messages = .N,
      end.newcomer.period = i.end.newcomer.period),
    on = .(wiki.name, user.talk, date.time <= end.newcomer.period),
    by = .EACHI,
    nomatch = 0L]

  newcomer.messages <- newcomer.messages[newcomers,
    .(wiki.name,
      editor,
      n.messages = x.n.messages,
      is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
    on = .(wiki.name, editor),
    nomatch = NA]

  newcomers <- newcomers[newcomer.messages,
    ":="(n.messages = ifelse(is.na(i.n.messages), 0L, i.n.messages),
         is.messaged = ifelse(is.na(i.n.messages), FALSE, i.is.messaged)),
    on = .(wiki.name, editor)]

  last.edit <- max(all.edits$date.time)
  last.wikia.edit <- max(all.edits[wiki.type == "wikia", date.time])

  ## drop newcomers whose first edit is within 60 days of the end of the data
  newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60, units = "days")]

  ## NOTE(review): this keeps wikia wikis only -- confirm that dropping every
  ## other wiki.type is intended.
  newcomers <- newcomers[(wiki.type == "wikia") &
                           (time.first.edit <
                              (last.wikia.edit - as.difftime(60, units = "days")))]
  print(" done")
  return(newcomers)
}

## Project-namespace (namespace 4) edits by registered editors.
## week.length is accepted but not used.
build.namespace4.dataset <- function(all.edits,
                                     week.length = as.difftime(7, units = "days")) {
  ns4.reg.edits <- all.edits[(namespace == 4) & (anon == FALSE)]
  return(ns4.reg.edits)
}

## One row per (wiki, week) with editor counts, revert / discussion / message
## rates for namespace 0 and activity measures for namespace 4.
## week.length is accepted but not used; weeks come from the `week` column.
build.wiki.level.variables <- function(all.edits,
                                       week.length = as.difftime(7, units = "days")) {

  wiki.data <- all.edits[,
                         .(n.editors = length(unique(editor)),
                           total.wiki.length = last(total.wiki.length)),
                         by = .(wiki.name, week)]

  wiki.ns4.data <- all.edits[namespace == 4,
                             .(n.ns4.edits = .N,
                               n.ns4.editors = length(unique(editor)),
                               d.ns4.length = sum(chars.change),
                               ns4.editor.age = mean(age)),
                             by = .(wiki.name, week)]

  wiki.ns0.data <- all.edits[namespace == 0,
    .(revert.rate = mean(reverted, na.rm = TRUE),
      newcomer.revert.rate = sum((reverted & is.newcomer), na.rm = TRUE) /
        sum(is.newcomer, na.rm = TRUE),
      revert.disc.rate = sum((reverted & reverter.talks), na.rm = TRUE) /
        sum(reverted, na.rm = TRUE),
      newcomer.revert.disc.rate =
        sum((reverted & reverter.talks & is.newcomer), na.rm = TRUE) /
        sum(reverted & is.newcomer, na.rm = TRUE),
      revert.message.rate = sum((reverted & reverter.messages), na.rm = TRUE) /
        sum(reverted, na.rm = TRUE),
      newcomer.revert.message.rate =
        sum((reverted & reverter.messages & is.newcomer), na.rm = TRUE) /
        sum((reverted & is.newcomer), na.rm = TRUE),
      newcomer.edits.rate = mean(is.newcomer, na.rm = TRUE),
      bot.revert.rate = mean(reverted.by.bot, na.rm = TRUE),
      bot.revert.prop = sum(reverted.by.bot, na.rm = TRUE) /
        sum(reverted, na.rm = TRUE),
      newcomer.bot.revert.rate = mean((reverted.by.bot & is.newcomer), na.rm = TRUE),
      newcomer.bot.revert.prop =
        sum((reverted.by.bot & is.newcomer), na.rm = TRUE) /
        sum((reverted & is.newcomer), na.rm = TRUE),
      admin.revert.rate = mean(reverted.by.admin, na.rm = TRUE),
      admin.revert.prop = sum(reverted.by.admin, na.rm = TRUE) /
        sum(reverted, na.rm = TRUE),
      year = year(first(date.time)),
      month = month(first(date.time))),
    by = .(wiki.name, week)]

  ## replace NAs with 0 (revert.rate and newcomer.revert.rate are left alone,
  ## as before)
  na.to.zero <- c("revert.disc.rate",
                  "newcomer.revert.disc.rate",
                  "revert.message.rate",
                  "newcomer.revert.message.rate",
                  "newcomer.edits.rate",
                  "bot.revert.rate",
                  "bot.revert.prop",
                  "newcomer.bot.revert.rate",
                  "newcomer.bot.revert.prop",
                  "admin.revert.rate",
                  "admin.revert.prop")
  for (rate.col in na.to.zero) {
    set(wiki.ns0.data,
        i = which(is.na(wiki.ns0.data[[rate.col]])),
        j = rate.col,
        value = 0)
  }

  ## bring it together
  wiki.data[wiki.ns0.data,
            ":="(revert.rate = i.revert.rate,
                 revert.disc.rate = i.revert.disc.rate,
                 newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
                 revert.message.rate = i.revert.message.rate,
                 newcomer.revert.message.rate = i.newcomer.revert.message.rate,
                 newcomer.edits.rate = i.newcomer.edits.rate,
                 bot.revert.rate = i.bot.revert.rate,
                 bot.revert.prop = i.bot.revert.prop,
                 newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
                 newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
                 admin.revert.rate = i.admin.revert.rate,
                 admin.revert.prop = i.admin.revert.prop),
            on = .(wiki.name, week)]

  wiki.data[wiki.ns4.data,
            ":="(n.ns4.edits = i.n.ns4.edits,
                 n.ns4.editors = i.n.ns4.editors,
                 d.ns4.length = i.d.ns4.length,
                 ns4.editor.age = i.ns4.editor.age),
            on = .(wiki.name, week)]

  # create variables for community size in standard deviation units
  return(wiki.data)
}

## Make `all.edits` available as a global variable: read it from
## all.edits.RDS when that file exists, otherwise build it (and save it unless
## `nosave` is set). Does nothing when `all.edits` already exists. Reads the
## globals wiki.list, newcomer.period, bots, admins and nosave.
load.all.edits <- function() {
  if (!exists("all.edits")) {
    file.name <- "all.edits.RDS"
    if (!file.exists(file.name)) {
      print("loading wikiq data")
      all.edits <- build.newcomer.table.step1(wiki.list,
                                              newcomer.period = newcomer.period)
      print("done")

      print("adding user role data")
      all.edits <- add.userroles(all.edits, bots = bots, admins = admins)
      print("done")

      print("identifying reverts and messages")
      all.edits <- identify.revert.messages(all.edits,
                                            week.length = as.difftime(7, units = "days"))
      print("done")

      if (!nosave) {
        print("saving work")
        saveRDS(all.edits, file.name)
        print("done")
      }
    } else {
      print("loading wikiq data with reverts and messages")
      all.edits <- readRDS(file.name)
      print("done")
    }

    remember(min(all.edits$date.time), "earliest.data.point")
    remember(max(all.edits$date.time), "latest.data.point")

    ## make all.edits a global variable
    all.edits <<- all.edits
  }
}

newcomer.period <- duration(2 * 30, units = "days")
newcomer.sunset <- duration(30 * 6, units = "days")
week.length <- duration(7, units = "days")

remember(newcomer.period)
remember(newcomer.sunset)
remember(week.length)

## try loading newcomers
if (!exists("newcomers")) {
  file.name2 <- "newcomers.RDS"
  if (file.exists(file.name2)) {
    newcomers <- readRDS(file.name2)
  } else {
    print("building newcomers table")
    load.all.edits()
    newcomers <- build.newcomers(all.edits,
                                 newcomer.sunset = newcomer.sunset,
                                 newcomer.period = newcomer.period)
    print("done")
    print("saving work")
    if (!nosave) {
      saveRDS(newcomers, file.name2)
    }
  }
}

if (!exists("ns4.reg.edits")) {
  file.name <- "ns4.reg.edits.RDS"
  if (file.exists(file.name)) {
    ns4.reg.edits <- readRDS(file.name)
  } else {
    print("building ns4 edits table")
    ## create table of namespace 4 edits from all edits
    load.all.edits()
    ns4.reg.edits <- build.namespace4.dataset(all.edits)
    print("done")
    print("saving work")
    if (!nosave) {
      saveRDS(ns4.reg.edits, file.name)
    }
  }
}

if (!exists("wiki.data")) {
  file.name3 <- "wikiweeks.RDS"
  if (!file.exists(file.name3)) {
    print("building wiki level variable")
    load.all.edits()
    wiki.data <- build.wiki.level.variables(all.edits, week.length = week.length)
    print("done")
    print("saving work")
    if (!nosave) {
      saveRDS(wiki.data, file.name3)
    }
    print("done")
  } else {
    wiki.data <- readRDS(file.name3)
  }
}

#wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
#remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
#newcomers <- newcomers[!(wiki.name %in% wikis.to.remove)]
#all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]

if (!exists("wiki.stats")) {
  file.name <- "wiki.stats.RDS"
  if (!file.exists(file.name)) {
    load.all.edits()
    editor.tenures <- all.edits[,
                                .(tenure = first(editor.tenure)),
                                by = .(wiki.name, editor)]

    wiki.stats <- all.edits[,
                            .(total.editors = length(unique(editor)),
                              total.edits = .N,
                              total.reverts = sum(reverted),
                              total.bot.reverts = sum(reverted.by.bot, na.rm = TRUE),
                              ## count directly instead of subsetting .SD per group
                              total.ns4.edits = sum(namespace == 4, na.rm = TRUE),
                              med.edit.tenure = median(editor.tenure)),
                            by = .(wiki.name)]

    med.editor.tenure <- editor.tenures[,
                                        .(med.editor.tenure = median(tenure)),
                                        by = .(wiki.name)]

    ## NOTE(review): the column is stored as med.tenure while row2 below names
    ## "med.editor.tenure" -- confirm which name is intended.
    wiki.stats[med.editor.tenure, med.tenure := med.editor.tenure, on = "wiki.name"]

    newcomer.stats <- newcomers[,
                                .(retention.rate = mean(survives),
                                  reverted.newcomers = sum(is.reverted)),
                                by = .(wiki.name)]

    wiki.stats <- wiki.stats[newcomer.stats,
                             ":="(retention.rate = retention.rate,
                                  reverted.newcomers = reverted.newcomers),
                             on = "wiki.name"]

    remember(wiki.stats, silent = TRUE)
    saveRDS(wiki.stats, file.name)
  } else {
    wiki.stats <- readRDS(file.name)
  }
}

row1 <- c("total.editors", "total.reverts", "total.bot.reverts", "total.ns4.edits")
row2 <- c("med.editor.tenure", "retention.rate")

m.wiki.stats <- melt(wiki.stats,
                     id.vars = "wiki.name",
                     measure.vars = c("total.editors", "total.reverts",
                                      "total.bot.reverts", "total.ns4.edits"))

m.wiki.stats[variable %in% row1,
             ":="(row = 1, col = which(row1 == variable, useNames = FALSE)),
             by = variable]
m.wiki.stats[variable %in% row2,
             ":="(row = 2, col = which(row2 == variable, useNames = FALSE)),
             by = variable]

## drop wikis with zero bot reverts; bump every other zero count to 1
m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]

## Map wiki.stats column names to display labels (named character vector).
friendly.var <- function(varname) {
  vapply(as.character(varname),
         function(f) switch(f,
                            total.editors = 'Editors',
                            total.reverts = 'Reverts',
                            total.bot.reverts = 'Bot reverts',
                            total.ns4.edits = 'Edits to the project namespace'),
         character(1))
}

## Map wiki.stats column names to their position 1-4 (named numeric vector).
var.id <- function(varname) {
  vapply(as.character(varname),
         function(f) switch(f,
                            total.editors = 1,
                            total.reverts = 2,
                            total.bot.reverts = 3,
                            total.ns4.edits = 4),
         numeric(1))
}

med.line.width <- 1

m.wiki.stats[, variable := friendly.var(variable)]
m.wiki.stats <- m.wiki.stats[, variable := factor(variable,
                                                  levels = c('Editors',
                                                             "Reverts",
                                                             "Bot reverts",
                                                             "Edits to the project namespace"))]

spoke.data <- m.wiki.stats[, .(y = median(value)), by = variable]

remember(m.wiki.stats)
remember(spoke.data)
remember(nrow(wiki.stats), "n.wikia.wikis")

## join wiki-level variables with newcomer variables to get ready to model newcomer retention.
newcomers <- newcomers[wiki.data,
  ":="(wiki.name = i.wiki.name,
       week = i.week,
       n.editors = i.n.editors,
       total.wiki.length = i.total.wiki.length,
       revert.rate = i.revert.rate,
       revert.disc.rate = i.revert.disc.rate,
       newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
       revert.message.rate = i.revert.message.rate,
       newcomer.revert.message.rate = i.newcomer.revert.message.rate,
       newcomer.edits.rate = i.newcomer.edits.rate,
       bot.revert.rate = i.bot.revert.rate,
       bot.revert.prop = i.bot.revert.prop,
       newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
       newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
       admin.revert.rate = i.admin.revert.rate,
       admin.revert.prop = i.admin.revert.prop,
       n.ns4.edits = i.n.ns4.edits,
       n.ns4.editors = i.n.ns4.editors,
       d.ns4.length = i.d.ns4.length,
       ns4.editor.age = i.ns4.editor.age,
       wiki.age.weeks = as.double(wiki.age, units = 'days') / 7,
       wiki.age.months = floor(as.double(wiki.age, units = 'days') / 30),
       wiki.age.half.years = floor(as.double(wiki.age, units = 'years') * 2),
       wiki.age.years = floor(as.double(wiki.age, units = 'years')),
       quarter = factor(floor_date(time.first.edit, unit = "3 months"))),
  on = .(wiki.name, week)]

## the grouping columns are part of the result already; repeating them in j
## produced duplicate wiki.name / week columns
survival.data <- newcomers[,
                           .(survival.rate = mean(survives),
                             n.newcomers = .N),
                           by = .(wiki.name, week)]

wiki.data <- wiki.data[survival.data,
                       ":="(survival.rate = survival.rate,
                            n.newcomers = n.newcomers),
                       on = .(wiki.name, week)]

file.name <- "active.editors.RDS"
if (!file.exists(file.name)) {
  load.all.edits()
  active.editors <- all.edits[,
                              .(N.edits = .N,
                                wiki.age.years = first(wiki.age.years)),
                              by = .(wiki.name, editor, wiki.age.months)]
  saveRDS(active.editors, file.name)
} else {
  active.editors <- readRDS(file.name)
}