1 # Library containing code for processing wikiq tsvs into datasets
2 # Copyright (C) 2018 Nathan TeBlunthuis
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 ### is it more efficient to develop inside the loop or outside?
20 ## with group by outside mclapply
21 ## user system elapsed
24 ## user system elapsed
25 ## 609.715 592.603 638.172
27 ## with group by inside mclapply
28 ## user system elapsed
31 ## user system elapsed
32 ## 739.826 408.396 596.346
33 ## conclusion: do as much outside mclapply as possible
# Build the edit-level table: load every wiki's edits and derive per-editor
# timing, session and newcomer columns on a single data.table.
#
# Arguments:
#   wiki.list                one row per wiki; each row index is handed to
#                            load.wikiq.files together with wiki.list itself.
#   session.window.length    an edit starts a new session when the gap since the
#                            editor's previous edit exceeds this duration.
#   newcomer.period          an edit is a newcomer edit while the editor's age
#                            (time since their first edit) is below this.
#   newcomer.sunset          is.newcomer additionally requires that the wiki's last
#                            edit is more than this long after the editor's first edit.
#   n.early.period.sessions  edits whose nth.session is below this value are
#                            flagged in.early.session.
#
# NOTE(review): the return statement is not visible to the reviewer; presumably
# the function returns all.edits -- confirm.
35 build.newcomer.table.step1 <- function(wiki.list,
36 session.window.length = duration(1,units="hours"),
37 newcomer.period = duration(2*30,units="days"),
38 newcomer.sunset = duration(180,units="days"),
39 n.early.period.sessions = 1){
# Load the wikis in parallel, one task per row of wiki.list.
# NOTE(review): 1:nrow(wiki.list) yields c(1, 0) when wiki.list has no rows
# (seq_len(nrow(wiki.list)) would not), and F is an ordinary reassignable
# variable (FALSE is not).
40 d.list <- mclapply(1:nrow(wiki.list),load.wikiq.files,wiki.list=wiki.list,mc.preschedule=F)
41 # d.list <- lapply(1:nrow(wiki.list),wiki.list=wiki.list,load.wikiq.files)
# Stack the per-wiki tables into one table.
42 all.edits <- rbindlist(d.list)
# First and last edit time of each editor on each wiki.
45 ":="(time.first.edit = min(date.time),
46 time.last.edit = max(date.time)),
47 by=.(editor.id, wiki.name)]
# Strip literal double-quote characters from the text columns.
51 ":="(editor=gsub("\"","",editor),
52 title=gsub("\"","",title),
53 reverteds=gsub("\"","",reverteds))]
# Drop every edit attributed to the editor named "Default".
55 all.edits <- all.edits[editor != "Default"]
56 all.edits[,month:=floor_date(date.time,unit="month")]
# NOTE(review): the j argument is empty here, so this statement does not appear
# to compute or assign anything -- confirm whether it can be removed.
57 all.edits[,,by=.(wiki.name,editor)]
58 setkey(all.edits,wiki.name,editor.id,date.time)
59 ## fix the definition of session to edits that have less than 1 hour together
# Gaps to the previous / next edit of the same editor on the same wiki. The
# series is padded with the editor's first (resp. last) edit time so that diff()
# returns one value per row.
# NOTE(review): on the time.till.next.edit line diff() is closed before
# `lag=1,differences=1`, so those two names are passed to `:=` rather than to
# diff() and would be assigned as columns named `lag` and `differences`.
# Compare the time.since.last.edit line, where they are inside diff().
60 all.edits[,":="(time.since.last.edit = diff(c(first(time.first.edit),date.time),lag=1,differences=1),
61 time.till.next.edit = diff(c(date.time,last(time.last.edit))),lag=1,differences=1,
62 editor.tenure =as.duration(max(date.time)-min(date.time))),
63 by=.(editor.id,wiki.name)]
# Number the sessions of each editor: nth.session is the running count of
# session starts, so it stays 0 until the first gap longer than the window.
65 all.edits[,":="(new.session = time.since.last.edit > session.window.length),by=.(editor.id,wiki.name)]
66 all.edits[,":="(nth.session = cumsum(new.session)),by=.(editor.id,wiki.name)]
67 all.edits[,":="(in.early.session = nth.session < n.early.period.sessions)]
# Per-editor summaries, repeated on each of the editor's rows. p.reverted is the
# share of the editor's edits that are both reverted and in namespace 0;
# n.first.session is the number of rows flagged in.early.session.
70 ":="(is.reverted = any(reverted),
71 is.deleted = any(deleted),
72 p.reverted = mean(reverted & namespace ==0),
73 n.first.session=nrow(.SD[in.early.session==TRUE])),
74 by=.(editor.id,wiki.name)]
# Editor age at the time of each edit.
75 all.edits[,":="(age = as.duration(date.time - time.first.edit))]
77 all.edits[,":="(last.wiki.edit = max(date.time)),by=.(wiki.name)]
# A newcomer edit: made within newcomer.period of the editor's first edit, by a
# non-anonymous editor whose first edit is more than newcomer.sunset before the
# last edit observed on the wiki.
78 all.edits[,":="(is.newcomer = (age < newcomer.period) & (as.duration(last.wiki.edit - time.first.edit) > as.duration(newcomer.sunset)) & !anon)]
80 ## did rejecting editors leave a comment on the talk page?
# Attach bot and admin role flags to the edit table.
#
# Arguments:
#   all.edits  edit-level data.table; gains/updates is.bot, is.admin and has
#              is.newcomer narrowed to exclude bots.
#   bots       table of bot roles; has a `wiki` column and is modified by `:=`
#              (so the caller's table is changed by reference).
#   admins     table of admin roles; same treatment as bots.
#
# NOTE(review): most of this function's lines are not visible to the reviewer.
# The visible conditions suggest each edit is matched to a role whose
# [role.period.begin, role.period.end] interval contains the edit's date.time
# -- confirm against the full join statements.
84 add.userroles <- function(all.edits,bots,admins){
# Give both role tables a wiki.name column copied from their `wiki` column.
86 bots[,":="(wiki.name = wiki,
92 admins[,":="(wiki.name = wiki,
102 date.time >= role.period.begin,
103 date.time <= role.period.end)
108 is.admin = i.is.admin
112 date.time >= role.period.begin,
113 date.time <= role.period.end)
# Edits that matched no role row are left NA by the joins; treat them as FALSE.
116 all.edits[,":="(is.bot = ifelse(is.na(is.bot),FALSE,is.bot),
117 is.admin = ifelse(is.na(is.admin),FALSE,is.admin))]
# Bot edits never count as newcomer edits.
119 all.edits[,":="(is.newcomer = (is.newcomer & !is.bot))]
123 identify.revert.messages <- function(all.edits, discussion.window = as.difftime(7,units="days"),week.length=as.difftime(7,units="days")){
125 all.edits[,user.talk:=as.factor(paste0("User talk:",as.character(all.edits$editor)))]
127 ## build each article's talk page title, to join the talk page edits with
128 all.edits[namespace==0,talk:=as.factor(paste0("Talk:",as.character(all.edits[namespace==0]$title)))]
130 print(" identifying reverts")
131 all.edits[!is.na(reverteds),reverted.edits := lapply(strsplit(reverteds,","),strtoi)]
133 all.edits[!is.na(reverteds),N.reverteds := lapply(reverted.edits,length)]
135 ns.edits = all.edits[namespace==0 | namespace==4]
137 reverted.lookup <- ns.edits[!is.na(reverteds),
138 .(revid = unlist(reverted.edits),
139 wiki.name = rep(wiki.name,N.reverteds),
140 reverted.by = rep(editor,N.reverteds),
141 reverted.by.bot = rep(is.bot, N.reverteds),
142 reverted.by.admin = rep(is.admin, N.reverteds),
143 revert.date.time = rep(date.time,N.reverteds),
144 revert.id = rep(revid,N.reverteds))]
146 reverted.edits <- ns.edits[reverted==TRUE]
148 reverted.edits[reverted.lookup,
149 ":="(reverted.by = i.reverted.by,
150 reverted.by.bot = i.reverted.by.bot,
151 reverted.by.admin = i.reverted.by.admin,
152 revert.date.time = i.revert.date.time,
153 revert.id = revert.id),
154 on=.(wiki.name,revid)]
156 reverted.edits[,message.window.end:= revert.date.time + discussion.window]
158 ## merge back revert info to all.edits
159 all.edits[reverted.edits,":="(
160 reverted.by = i.reverted.by,
161 reverted.by.bot = i.reverted.by.bot,
162 reverted.by.admin = i.reverted.by.admin,
163 revert.date.time = i.revert.date.time,
164 revert.id = revert.id,
165 message.window.end = message.window.end),
166 on = .(wiki.name, revid)]
169 print(" identifying editor talk page edits")
170 ns0.edits = all.edits[namespace==0]
172 ## we want talkers who talk before the end of the window
173 talk.page.edits = all.edits[namespace==1]
174 talk.page.edits[,talk:=title]
177 ## we only need to keep the key identifier for each revert
178 ## use editor + title instead of revid since editors may have more than
179 ## one edit reverted by a given revert.id.
180 ## key = wiki.name,editor,title,revert.id,
182 setkeyv(reverted.edits,c("wiki.name","editor","title","revert.id"))
183 ## condition where editor discusses after being reverted
184 editor.talks <- reverted.edits[talk.page.edits,
188 revert.id = x.revert.id,
190 talk.date.time=i.date.time
195 revert.date.time<date.time,
196 message.window.end>=date.time)
199 editor.talks <- editor.talks[,
202 time.editor.talks = min(talk.date.time),
203 editor.talks.revid = min(talk.id)
205 by = .(wiki.name,editor,revert.id)
208 ## merge back reverted edits to all.edits
209 all.edits[editor.talks,
210 ":="(editor.talks = editor.talks,
211 time.editor.talks = time.editor.talks,
212 editor.talks.revid=editor.talks.revid),
213 on=.(wiki.name,editor,revert.id)]
216 rm(editor.talks, reverted.lookup)
219 print(" identifying reverter talk page edits")
220 all.edits[,":="(response.window.end = time.editor.talks + discussion.window)]
221 all.edits[(reverted==TRUE & is.na(editor.talks)), editor.talks := FALSE]
222 ns0.edits = all.edits[namespace==0]
223 reverted.edits <- ns0.edits[reverted==TRUE]
224 talk.page.edits <- all.edits[namespace==1]
225 talk.page.edits[,":="(talk = title,reverted.by=editor)]
227 # the key is still wiki.name, editor, revert.id
228 reverter.talks <- reverted.edits[talk.page.edits,
230 wiki.name = wiki.name,
232 revert.id = x.revert.id,
233 revert.date.time = x.revert.date.time,
234 time.reverter.talks = i.date.time,
235 reverter.talk.id = i.revid
241 revert.date.time<date.time,
242 response.window.end>=date.time),
245 reverter.talks <- reverter.talks[time.reverter.talks > revert.date.time,
247 reverter.talks = TRUE,
248 time.reverter.talks = min(time.reverter.talks),
249 reverter.talk.id = min(reverter.talk.id)
251 by=.(wiki.name,editor,revert.id)
255 ## merge back reverted.edits to all.edits
256 all.edits[reverter.talks,
257 ":="(reverter.talks = reverter.talks,
258 time.reverter.talks = time.reverter.talks,
259 reverter.talk.id = reverter.talk.id),
260 on=.(wiki.name,editor,revert.id)]
263 rm(reverter.talks,talk.page.edits)
265 all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]
267 # if the editor didn't talk first, the time window is different
268 all.edits[reverter.talks == TRUE,
269 editor.talks.first := (time.editor.talks < time.reverter.talks)]
271 all.edits[(reverter.talks == TRUE) & (editor.talks.first==FALSE),
272 reverter.talks := time.reverter.talks < (date.time + discussion.window)]
275 print(" identifying User talk page edits")
277 ## now do the same thing but for user talk pages
278 ## did the reverter post on the editor's user talk page?
279 ## key is wiki.name, title, reverted.by, revert.id
280 ns0.edits = all.edits[namespace==0]
281 user.talk.edits = all.edits[namespace==3]
282 user.talk.edits[,":="(reverted.by=editor,user.talk=title)]
283 reverted.edits = ns0.edits[reverted==TRUE]
284 reverter.messages = reverted.edits[user.talk.edits,
285 .(wiki.name = x.wiki.name,
287 revert.id = x.revert.id,
289 reverted.by = i.reverted.by,
290 time.reverter.messages=i.date.time,
291 reverter.messages.id=i.revid),
295 revert.date.time <= date.time,
296 message.window.end >= date.time
300 reverter.messages = reverter.messages[,.(reverter.messages = TRUE,
301 time.reverter.messages = min(time.reverter.messages),
302 reverter.message.id = min(reverter.messages.id)),
303 by=.(wiki.name, editor, reverted.by, revert.id)]
305 reverted.edits[reverter.messages,":="(reverter.messages = reverter.messages,
306 time.reverter.messages = time.reverter.messages,
307 reverter.message.id = reverter.message.id),
308 on=.(wiki.name, editor, revert.id)]
310 reverted.edits[is.na(reverter.messages), reverter.messages := FALSE]
312 all.edits[reverted.edits,":="(reverter.messages = reverter.messages,
313 time.reverter.messages = time.reverter.messages,
314 reverter.message.id = reverter.message.id),
315 on=.(wiki.name, editor, revert.id)]
317 ## set some wiki-level variables
318 print(" creating wiki windows")
319 setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
320 all.edits[,":="(chars.change = diff(c(0L,text.chars),lag=1,differences=1),
321 creates.article = (date.time == min(date.time))
322 ),by=.(wiki.name,articleid)]
324 setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
326 # Some wikis got created by Wikia - invalidating wiki age that doesn't remove this editor
328 all.edits[,":="(wiki.birth.date = min(date.time)),by=.(wiki.name)]
330 all.edits[,":="(total.wiki.length = cumsum(chars.change),
331 n.articles = cumsum(creates.article),
332 wiki.age = as.duration(date.time - wiki.birth.date),
333 year = year(date.time)
336 all.edits[,":="(wiki.age.months = floor(as.double(wiki.age,units='days')/30),
337 wiki.age.years = floor(as.double(wiki.age,units='years')))]
339 ## generate breaks at precisely 1 week +/- the first edit.
340 date.range <- all.edits[,.(first.edit = min(date.time),last.edit = max(date.time)), by = .(wiki.name)]
342 window.breaks <- date.range[,.(breaks = seq(trunc(first.edit,"days"),
343 trunc(last.edit,"days"),
345 break.next = seq(trunc(first.edit+week.length,"days"),
346 trunc(last.edit+week.length,"days"),
351 ":="(i.break = 1:length(breaks))
354 all.edits[window.breaks,
357 on=.(wiki.name, date.time <=break.next,date.time >=breaks)]
361 all.edits[,":="(reverted.edits = NULL,
366 message.window.end=NULL,
367 response.window.end=NULL)]
370 rm(reverted.edits,reverter.messages,user.talk.edits,ns0.edits)
374 build.newcomers <- function(all.edits,
375 newcomer.period = duration(60,unit="days"),
376 newcomer.sunset= duration(30*6,unit="days")
378 setkeyv(all.edits,'date.time')
380 all.edits[,":="(time.last.edit.to.wiki = max(date.time)), by=.(wiki.name)]
382 all.edits <- all.edits[,time.till.page.edit := c(diff(date.time),as.numeric(NA)),by=.(wiki.name,articleid)]
383 all.edits <- all.edits[,last.edit.to.page :=is.na(time.till.page.edit)]
385 all.edits[last.edit.to.page == TRUE,time.till.page.edit := time.last.edit.to.wiki-date.time]
387 all.edits <- all.edits[,time.till.page.edit := log1p(as.numeric(time.till.page.edit,units='days'))]
389 editor.variables <- all.edits[,
390 .(survives = any( (age > newcomer.period) & (age < newcomer.sunset)),anon=first(anon),is.bot=any(is.bot),is.admin=any(is.admin)),
391 by = .(wiki.name,editor)
394 first.session.edits <- all.edits[in.early.session==TRUE]
395 first.session.edits[,":="(end.newcomer.period = time.first.edit + newcomer.period)]
397 print(" aggregating newcomer activity within wikis")
398 newcomers <- first.session.edits[namespace == 0,
400 is.reverted = any(reverted & reverted.by != editor),
401 p.reverted = first(p.reverted),
402 is.bot.reverted = any(reverted.by.bot),
403 is.admin.reverted = any(reverted.by.admin),
404 is.reverted.messaged = any(reverter.messages |
405 reverter.talks,na.rm=TRUE),
406 reverter.talks = any(reverter.talks, na.rm=TRUE),
407 reverter.messages = any(reverter.messages, na.rm=TRUE),
408 editor.talks = any(editor.talks,na.rm=TRUE),
409 time.next.page.edit = min(time.till.next.edit, na.rm=TRUE),
410 BRD.initiation = any(editor.talks &
411 (editor.talks.first |
412 !reverter.talks), na.rm = TRUE),
414 BRD.reciprocation = any(editor.talks &
416 reverter.talks, na.rm = TRUE),
417 reverter.initates.BRD = any(reverter.talks & (!editor.talks.first |
418 is.na(editor.talks.first)),na.rm=TRUE),
419 time.first.edit = first(time.first.edit),
420 time.till.page.edit = min(time.till.page.edit),
421 last.edit.to.page = all(last.edit.to.page),
422 end.newcomer.period = first(end.newcomer.period),
424 year = first(year(time.first.edit)),
426 session.edits = first(n.first.session),
427 ns0.edits = sum(namespace == 0),
428 ns1.edits = sum(namespace == 1),
429 ns4.edits = sum(namespace == 4),
430 newcomer.chars.change = sum(chars.change),
431 newcomer.creates.article = any(creates.article),
432 wiki.type = first(wiki.type),
433 wiki.age = first(wiki.age)
435 by = .(wiki.name, editor)
439 newcomers[editor.variables,":="(survives = survives,is.bot=is.bot,is.admin=is.admin), on=.(wiki.name,editor)]
441 newcomers <- newcomers[!is.bot & !is.admin]
443 print(" identifying newcomer activity on other wikis")
445 newcomer.prior.wikis <- first.session.edits[newcomers,
448 wiki.name = i.wiki.name,
449 other.wiki = x.wiki.name,
450 time.first.edit.this = i.time.first.edit,
451 time.first.edit.other = x.time.first.edit
454 on=.(wiki.type,editor,time.first.edit < time.first.edit),
456 allow.cartesian = TRUE
459 # using < time first edit should exclude edits to this wiki
460 newcomer.prior.wikis <- newcomer.prior.wikis[,.(n.edits.other = .N),
461 by=.(editor,wiki.name,other.wiki)]
463 newcomer.prior.wikis <- newcomer.prior.wikis[,
464 .(n.other.wikis = .N,
465 n.edits.other = sum(n.edits.other)),
466 by=.(wiki.name,editor)]
468 newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
472 n.other.wikis = n.other.wikis,
473 n.edits.other = n.edits.other,
474 has.edited.other.wikis = (n.other.wikis > 0) & (!is.na(n.other.wikis))),
475 on=.(wiki.name,editor),
478 newcomers <- newcomers[newcomer.prior.wikis,
479 ":="(n.other.wikis = ifelse(is.na(i.n.other.wikis),0,i.n.other.wikis),
480 n.edits.other = ifelse(is.na(i.n.edits.other),0,i.n.edits.other),
481 has.edited.other.wikis = (i.n.other.wikis > 0) & (!is.na(i.n.other.wikis))),
482 on=.(wiki.name, editor)
485 newcomers[,":="(has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis),FALSE,has.edited.other.wikis),
486 n.edits.other = ifelse(is.na(n.edits.other),0,n.edits.other),
487 n.other.wikis = ifelse(is.na(n.other.wikis),0,n.other.wikis)
491 print(" identifying all messages")
493 user.talk.edits <- all.edits[namespace==3]
495 user.talk.edits[,user.talk:=title]
497 newcomers[,user.talk:= as.factor(paste0("User talk:",as.character(editor)))]
499 newcomer.messages <- user.talk.edits[newcomers,
503 end.newcomer.period = i.end.newcomer.period
505 on=.(wiki.name,user.talk,date.time <= end.newcomer.period),
509 newcomer.messages <- newcomer.messages[newcomers,
512 n.messages = x.n.messages,
513 is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
514 on=.(wiki.name,editor),
517 newcomers <- newcomers[newcomer.messages,
518 ":="(n.messages = ifelse(is.na(i.n.messages),0L,i.n.messages),
519 is.messaged = ifelse(is.na(i.n.messages),FALSE,i.is.messaged)),
520 on=.(wiki.name,editor)]
522 last.edit <- max(all.edits$date.time)
523 last.wikia.edit <- max(all.edits[wiki.type=="wikia",date.time])
524 newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60,units="days")]
525 newcomers <- newcomers[(wiki.type == "wikia") & (time.first.edit < (last.wikia.edit - as.difftime(60,units="days")))]
# Return the subset of all.edits made in namespace 4 by non-anonymous editors.
#
# Arguments:
#   all.edits    edit-level data.table with `namespace` and `anon` columns.
#   week.length  not used by any visible line of this function.
532 build.namespace4.dataset <- function(all.edits, week.length = as.difftime(7,units="days")){
533 ns4.reg.edits <- all.edits[(namespace==4) & (anon==FALSE)]
535 return(ns4.reg.edits)
539 build.wiki.level.variables <- function(all.edits, week.length = as.difftime(7,units="days")){
541 wiki.data <- all.edits[,.(n.editors = length(unique(editor)),
542 total.wiki.length=last(total.wiki.length)
544 ,by=.(wiki.name,week)]
546 wiki.ns4.data <- all.edits[namespace==4,
548 n.ns4.editors = length(unique(editor)),
549 d.ns4.length = sum(chars.change),
550 ns4.editor.age = mean(age)
552 by=.(wiki.name, week)]
554 wiki.ns0.data <- all.edits[namespace==0,
555 .(revert.rate = mean(reverted,na.rm=TRUE),
556 newcomer.revert.rate = sum((reverted & is.newcomer),na.rm=TRUE)/sum(is.newcomer,na.rm=TRUE),
557 revert.disc.rate = sum((reverted & reverter.talks),na.rm=TRUE)/sum(reverted,na.rm=TRUE),
558 newcomer.revert.disc.rate = sum((reverted & reverter.talks & is.newcomer),na.rm=TRUE)/ sum(reverted & is.newcomer,na.rm=TRUE),
559 revert.message.rate = sum((reverted & reverter.messages),na.rm=TRUE)/sum(reverted,na.rm=TRUE),
560 newcomer.revert.message.rate = sum((reverted & reverter.messages & is.newcomer),na.rm=TRUE)/sum((reverted & is.newcomer),na.rm=TRUE),
561 newcomer.edits.rate = mean(is.newcomer,na.rm=TRUE),
562 bot.revert.rate = mean(reverted.by.bot,na.rm=TRUE),
563 bot.revert.prop = sum(reverted.by.bot,na.rm=TRUE)/sum(reverted,na.rm=TRUE),
564 newcomer.bot.revert.rate = mean((reverted.by.bot & is.newcomer),na.rm=TRUE),
565 newcomer.bot.revert.prop = sum((reverted.by.bot & is.newcomer),na.rm=TRUE)/sum((reverted & is.newcomer),na.rm=TRUE),
566 admin.revert.rate = mean(reverted.by.admin,na.rm=TRUE),
567 admin.revert.prop = sum(reverted.by.admin,na.rm=TRUE)/sum(reverted,na.rm=TRUE),
568 year = year(first(date.time)),
569 month = month(first(date.time))),
570 by=.(wiki.name,week)]
572 ## replace NAs with 0
575 # revert.rate = ifelse(is.na(revert.rate),0,revert.rate),
576 revert.disc.rate = ifelse(is.na(revert.disc.rate),0,revert.disc.rate),
577 newcomer.revert.disc.rate = ifelse(is.na(newcomer.revert.disc.rate),0,newcomer.revert.disc.rate),
578 revert.message.rate = ifelse(is.na(revert.message.rate),0,revert.message.rate),
579 newcomer.revert.message.rate = ifelse(is.na(newcomer.revert.message.rate),0,newcomer.revert.message.rate),
580 newcomer.edits.rate = ifelse(is.na(newcomer.edits.rate),0,newcomer.edits.rate),
581 bot.revert.rate = ifelse(is.na(bot.revert.rate),0,bot.revert.rate),
582 bot.revert.prop = ifelse(is.na(bot.revert.prop),0,bot.revert.prop),
583 newcomer.bot.revert.rate = ifelse(is.na(newcomer.bot.revert.rate),0,newcomer.bot.revert.rate),
584 newcomer.bot.revert.prop = ifelse(is.na(newcomer.bot.revert.prop),0,newcomer.bot.revert.prop),
585 admin.revert.rate = ifelse(is.na(admin.revert.rate),0,admin.revert.rate),
586 admin.revert.prop = ifelse(is.na(admin.revert.prop),0,admin.revert.prop)),
590 wiki.data[wiki.ns0.data,
592 revert.rate = i.revert.rate,
593 revert.disc.rate = i.revert.disc.rate,
594 newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
595 revert.message.rate = i.revert.message.rate,
596 newcomer.revert.message.rate = i.newcomer.revert.message.rate,
597 newcomer.edits.rate = i.newcomer.edits.rate,
598 bot.revert.rate = i.bot.revert.rate,
599 bot.revert.prop = i.bot.revert.prop,
600 newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
601 newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
602 admin.revert.rate = i.admin.revert.rate,
603 admin.revert.prop = i.admin.revert.prop),
604 on=.(wiki.name,week)]
606 wiki.data[wiki.ns4.data,
608 n.ns4.edits = i.n.ns4.edits,
609 n.ns4.editors = i.n.ns4.editors,
610 d.ns4.length = i.d.ns4.length,
611 ns4.editor.age = i.ns4.editor.age
613 on=.(wiki.name,week)]
615 # create variables for community size in standard deviation units
# Make the edit-level table available as the global `all.edits`, building it
# only when necessary and caching the result on disk in "all.edits.RDS".
#
# Takes no arguments; instead it reads wiki.list, newcomer.period, bots and
# admins from the enclosing/global environment, so those must exist before the
# build path runs.
#
# NOTE(review): the lines that close the two `if` blocks (and any `else`) are
# not visible to the reviewer, so the exact scope of the statements after
# saveRDS() is presumed from the messages they print -- confirm.
620 load.all.edits <- function(){
# Skip all work when an object named all.edits can already be found.
621 if(!exists("all.edits")){
622 file.name <- "all.edits.RDS"
# No cache file yet: run the full pipeline (load -> user roles -> reverts and
# messages) and save the result.
623 if(!file.exists(file.name)){
624 print("loading wikiq data")
626 all.edits <- build.newcomer.table.step1(wiki.list, newcomer.period = newcomer.period)
630 print("adding user role data")
631 all.edits <- add.userroles(all.edits,bots=bots,admins=admins)
634 print("identifying reverts and messages")
635 all.edits <- identify.revert.messages(all.edits,week.length=as.difftime(7,units="days"))
639 saveRDS(all.edits,file.name)
# Presumably the branch taken when the cache file exists: read it back.
643 print("loading wikiq data with reverts and messages")
644 all.edits <- readRDS(file.name)
# remember() is defined outside this file's visible lines; presumably it
# records the value under the given label -- confirm.
648 remember(min(all.edits$date.time),"earliest.data.point")
649 remember(max(all.edits$date.time),"latest.data.point")
651 ## make all.edits a global variable
# `<<-` assigns in an enclosing environment (the global one when none of the
# enclosing scopes already defines all.edits); this is the function's output.
652 all.edits <<- all.edits
656 newcomer.period = duration(2*30,unit="days")
657 newcomer.sunset = duration(30*6,unit="days")
658 week.length=duration(7,unit="days")
659 remember(newcomer.period)
660 remember(newcomer.sunset)
661 remember(week.length)
663 ## try loading newcomers
665 if(!exists("newcomers")){
666 file.name2 <- "newcomers.RDS"
667 if(file.exists(file.name2)){
668 newcomers <- readRDS(file.name2)
670 print("building newcomers table")
673 newcomers <- build.newcomers(all.edits,
674 newcomer.sunset = newcomer.sunset,
675 newcomer.period=newcomer.period)
680 saveRDS(newcomers,file.name2)
686 if(!exists("ns4.reg.edits")){
687 file.name <- "ns4.reg.edits.RDS"
688 if(file.exists(file.name)){
689 ns4.reg.edits <- readRDS(file.name)
691 print("building ns4 edits table")
693 ## create table of namespace 4 edits from all edits
695 ns4.reg.edits <- build.namespace4.dataset(all.edits)
699 saveRDS(ns4.reg.edits,file.name)
704 if(!exists("wiki.data")){
705 file.name3 <- "wikiweeks.RDS"
706 if(!file.exists(file.name3)){
707 print("building wiki level variable")
709 wiki.data <- build.wiki.level.variables(all.edits, week.length=week.length)
713 saveRDS(wiki.data,file.name3)
718 wiki.data <- readRDS(file.name3)
722 #wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
723 #remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
724 #newcomers <- newcomers[!(wiki.name %in% wikis.to.remove)]
725 #all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]
726 if(!exists("wiki.stats")){
727 file.name <- "wiki.stats.RDS"
728 if(!file.exists(file.name)){
731 editor.tenures <- all.edits[,.(tenure=first(editor.tenure)),by=.(wiki.name,editor)]
732 wiki.stats <- all.edits[,.(total.editors = length(unique(editor)),
734 total.reverts = sum(reverted),
735 total.bot.reverts = sum(reverted.by.bot,na.rm=TRUE),
736 total.ns4.edits = nrow(.SD[namespace==4]),
737 med.edit.tenure = median(editor.tenure)
740 med.editor.tenure <- editor.tenures[,.(med.editor.tenure=median(tenure)),by=.(wiki.name)]
742 wiki.stats[med.editor.tenure,med.tenure := med.editor.tenure,on="wiki.name"]
743 newcomer.stats <- newcomers[,.(retention.rate = mean(survives),
744 reverted.newcomers = sum(is.reverted)
746 wiki.stats <- wiki.stats[newcomer.stats,':='(retention.rate = retention.rate, reverted.newcomers = reverted.newcomers), on="wiki.name"]
747 remember(wiki.stats,silent=TRUE)
748 saveRDS(wiki.stats,file.name)
750 wiki.stats <- readRDS("wiki.stats.RDS")
754 row1 <- c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits")
755 row2 <- c("med.editor.tenure","retention.rate")
756 m.wiki.stats <- melt(wiki.stats,id='wiki.name',measure.vars = c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits"))
757 m.wiki.stats[variable %in% row1, ":="(row = 1,col=which(row1 == variable,useNames=F)),by=variable]
758 m.wiki.stats[variable %in% row2, ":="(row = 2,col=which(row2 == variable,useNames=F)),by=variable]
760 m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
761 m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]
# Translate wiki.stats variable names into human-readable labels.
#
# Arguments:
#   varname  vector of variable names (coerced with as.character, so factors
#            are accepted).
# Returns the label for each element, named by the input values.
#
# NOTE(review): switch() has no default branch here, so a name outside the four
# listed below yields NULL, in which case sapply() cannot simplify and returns
# a list instead of a character vector.
763 friendly.var <- function(varname){
764 sapply(as.character(varname),function(f) switch(f,
765 total.editors='Editors',
766 total.reverts='Reverts',
767 total.bot.reverts='Bot reverts',
768 total.ns4.edits='Edits to the project namespace'))
771 var.id <- function(varname){
772 sapply(as.character(varname),function(f) switch(f,
780 m.wiki.stats[,variable := friendly.var(variable)]
781 m.wiki.stats <- m.wiki.stats[,variable:=factor(variable,levels=c('Editors',"Reverts","Bot reverts","Edits to the project namespace"))]
783 spoke.data <- m.wiki.stats[,.(y = median(value)),by=variable]
784 remember(m.wiki.stats)
786 remember(nrow(wiki.stats),"n.wikia.wikis")
788 ## join wiki-level variables with newcomer variables to get ready to model newcomer retention.
789 newcomers <- newcomers[wiki.data,
791 wiki.name=i.wiki.name,
793 n.editors = i.n.editors,
794 total.wiki.length = i.total.wiki.length,
795 revert.rate = i.revert.rate,
796 revert.disc.rate = i.revert.disc.rate,
797 newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
798 revert.message.rate = i.revert.message.rate,
799 newcomer.revert.message.rate = i.newcomer.revert.message.rate,
800 newcomer.edits.rate = i.newcomer.edits.rate,
801 bot.revert.rate = i.bot.revert.rate,
802 bot.revert.prop = i.bot.revert.prop,
803 newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
804 newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
805 admin.revert.rate = i.admin.revert.rate,
806 admin.revert.prop = i.admin.revert.prop,
807 n.ns4.edits = i.n.ns4.edits,
808 n.ns4.editors = i.n.ns4.editors,
809 d.ns4.length = i.d.ns4.length,
810 ns4.editor.age = i.ns4.editor.age,
811 wiki.age.weeks = as.double(wiki.age,units='days')/7,
812 wiki.age.months = floor(as.double(wiki.age,units='days')/30),
813 wiki.age.half.years = floor(as.double(wiki.age,units='years')*2),
814 wiki.age.years = floor(as.double(wiki.age,units='years')),
815 quarter = factor(floor_date(time.first.edit,unit="3 months"))
821 survival.data <- newcomers[,.(wiki.name,
823 survival.rate = mean(survives),
825 by = .(wiki.name, week)]
826 wiki.data <- wiki.data[survival.data,
828 survival.rate = survival.rate,
829 n.newcomers = n.newcomers),
830 on = .(wiki.name,week)]
832 file.name <- "active.editors.RDS"
833 if(!file.exists(file.name)){
835 active.editors <- all.edits[,
837 wiki.age.years=first(wiki.age.years)),
841 saveRDS(active.editors, file.name)
844 active.editors <- readRDS(file.name)