]> code.communitydata.science - rises_declines_wikia_code.git/blob - lib-01-build_newcomer_table.R
add copy of the GPL
[rises_declines_wikia_code.git] / lib-01-build_newcomer_table.R
1 # Library containing code for processing wikiq tsvs into datasets
2 # Copyright (C) 2018  Nathan TeBlunthuis
3
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU General Public License for more details.
13
14 # You should have received a copy of the GNU General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 library(urltools)
18 library(lubridate)
19 ### is it more efficient to develop inside the loop or outside?
20 ## with group by outside mclapply
21 ##  user  system elapsed 
22 ## 3.743   8.112   6.219 
23
24 ##    user  system elapsed 
25 ## 609.715 592.603 638.172 
26
27 ## with group by inside mclapply
28 ##  user  system elapsed 
29 ## 3.670  8.302   5.780 
30
31 ##    user  system elapsed 
32 ## 739.826 408.396 596.346 
33 ## conclusion: do as much outside mclapply as possible
34
## Load the wikiq edits for every wiki in wiki.list and annotate each edit
## with editor-level timing, session and newcomer variables.
##
## Args:
##   wiki.list: table with one row per wiki. Each row index is passed to
##     load.wikiq.files() (defined elsewhere) together with wiki.list.
##   session.window.length: a gap between two consecutive edits by the same
##     editor that is longer than this starts a new session.
##   newcomer.period: an edit is a newcomer edit while the editor's age
##     (time since their first edit on the wiki) is below this.
##   newcomer.sunset: the wiki must have been edited for longer than this
##     after the editor's first edit for the editor to count as a newcomer.
##   n.early.period.sessions: sessions numbered below this (the first
##     session is number 0) are flagged in.early.session.
##
## Returns:
##   The combined edit table with the added columns time.first.edit,
##   time.last.edit, month, time.since.last.edit, time.till.next.edit,
##   editor.tenure, new.session, nth.session, in.early.session, is.reverted,
##   is.deleted, p.reverted, n.first.session, age, last.wiki.edit and
##   is.newcomer.
build.newcomer.table.step1 <- function(wiki.list,
                                       session.window.length = duration(1,units="hours"),
                                       newcomer.period = duration(2*30,units="days"),
                                       newcomer.sunset = duration(180,units="days"),
                                       n.early.period.sessions = 1){
    ## seq_len() yields an empty index for an empty wiki.list (1:0 would not)
    d.list <- mclapply(seq_len(nrow(wiki.list)),
                       load.wikiq.files,
                       wiki.list = wiki.list,
                       mc.preschedule = FALSE)
    all.edits <- rbindlist(d.list)

    all.edits[,
              ":="(time.first.edit = min(date.time),
                   time.last.edit = max(date.time)),
              by = .(editor.id, wiki.name)]

    ## strip literal double quotes from the text fields
    all.edits[,
              ":="(editor = gsub("\"", "", editor),
                   title = gsub("\"", "", title),
                   reverteds = gsub("\"", "", reverteds))]

    all.edits <- all.edits[editor != "Default"]
    all.edits[, month := floor_date(date.time, unit = "month")]
    setkey(all.edits, wiki.name, editor.id, date.time)

    ## Gaps to the previous / next edit of the same editor. Padding with the
    ## editor's first (last) edit time makes the first (last) gap zero.
    ## lag/differences are arguments of diff(); the original closed diff()
    ## too early, which created stray `lag` and `differences` columns.
    all.edits[,
              ":="(time.since.last.edit = diff(c(first(time.first.edit), date.time),
                                               lag = 1, differences = 1),
                   time.till.next.edit = diff(c(date.time, last(time.last.edit)),
                                              lag = 1, differences = 1),
                   editor.tenure = as.duration(max(date.time) - min(date.time))),
              by = .(editor.id, wiki.name)]

    ## an edit opens a new session when the gap since the previous edit
    ## exceeds session.window.length; sessions are numbered from 0
    all.edits[, ":="(new.session = time.since.last.edit > session.window.length),
              by = .(editor.id, wiki.name)]
    all.edits[, ":="(nth.session = cumsum(new.session)),
              by = .(editor.id, wiki.name)]
    all.edits[, ":="(in.early.session = nth.session < n.early.period.sessions)]

    all.edits[,
              ":="(is.reverted = any(reverted),
                   is.deleted = any(deleted),
                   p.reverted = mean(reverted & namespace == 0),
                   ## count of early-session edits; same value as
                   ## nrow(.SD[in.early.session == TRUE]) without building .SD
                   n.first.session = sum(in.early.session, na.rm = TRUE)),
              by = .(editor.id, wiki.name)]
    all.edits[, ":="(age = as.duration(date.time - time.first.edit))]

    all.edits[, ":="(last.wiki.edit = max(date.time)), by = .(wiki.name)]
    all.edits[, ":="(is.newcomer = (age < newcomer.period) &
                         (as.duration(last.wiki.edit - time.first.edit) >
                          as.duration(newcomer.sunset)) &
                         !anon)]

    return(all.edits)
}
83
## Flag each edit as made by a bot and/or an admin, then exclude bots from
## the newcomer flag.
##
## Args:
##   all.edits: edit table with wiki.name, editor, date.time and is.newcomer.
##   bots, admins: role tables with columns wiki, user, role.period.begin,
##     role.period.end and is.bot / is.admin respectively. Both tables gain
##     wiki.name and editor columns by reference.
##
## Returns:
##   all.edits with is.bot and is.admin added (FALSE where no role period
##   covers the edit) and is.newcomer updated.
add.userroles <- function(all.edits,bots,admins){

    ## expose the role tables' keys under the column names used in all.edits
    bots[, c("wiki.name", "editor") := list(wiki, user), by = .(wiki, user)]
    admins[, c("wiki.name", "editor") := list(wiki, user), by = .(wiki, user)]

    ## an edit carries a role when it falls inside that role's period
    all.edits[bots,
              is.bot := i.is.bot,
              on = .(wiki.name,
                     editor,
                     date.time >= role.period.begin,
                     date.time <= role.period.end)]

    all.edits[admins,
              is.admin := i.is.admin,
              on = .(wiki.name,
                     editor,
                     date.time >= role.period.begin,
                     date.time <= role.period.end)]

    ## edits outside every role period are plain-user edits
    all.edits[, is.bot := ifelse(is.na(is.bot), FALSE, is.bot)]
    all.edits[, is.admin := ifelse(is.na(is.admin), FALSE, is.admin)]

    all.edits[, is.newcomer := is.newcomer & !is.bot]
    return(all.edits)
}
122
## Attach revert information to every reverted edit, detect discussion that
## follows a revert (article talk pages and user talk pages) and add
## wiki-level running totals and a week index.
##
## Args:
##   all.edits: edit table as produced by build.newcomer.table.step1() and
##     add.userroles(); modified by reference.
##   discussion.window: how long after a revert (or after the reverted
##     editor's talk edit) a talk/user-talk edit still counts as a response.
##   week.length: width of the per-wiki windows used for the week index.
##
## Returns:
##   all.edits with reverted.by, reverted.by.bot, reverted.by.admin,
##   revert.date.time, revert.id, editor.talks, time.editor.talks,
##   editor.talks.revid, reverter.talks, time.reverter.talks,
##   reverter.talk.id, editor.talks.first, reverter.messages,
##   time.reverter.messages, reverter.message.id, chars.change,
##   creates.article, wiki.birth.date, total.wiki.length, n.articles,
##   wiki.age, year, wiki.age.months, wiki.age.years and week added.
identify.revert.messages <- function(all.edits, discussion.window = as.difftime(7,units="days"),week.length=as.difftime(7,units="days")){

    ## titles of the pages where discussion about an edit would happen
    all.edits[, user.talk := as.factor(paste0("User talk:", as.character(editor)))]
    all.edits[namespace == 0, talk := as.factor(paste0("Talk:", as.character(title)))]

    print("    identifying reverts")
    ## reverteds is a comma separated list of the revision ids undone by an edit
    all.edits[!is.na(reverteds), reverted.edits := lapply(strsplit(reverteds, ","), strtoi)]
    ## integer count per reverting edit (was a list column of lengths)
    all.edits[!is.na(reverteds), N.reverteds := lengths(reverted.edits)]

    ns.edits <- all.edits[namespace == 0 | namespace == 4]

    ## one row per (reverting edit, reverted revision id)
    reverted.lookup <- ns.edits[!is.na(reverteds),
                                .(revid = unlist(reverted.edits),
                                  wiki.name = rep(wiki.name, N.reverteds),
                                  reverted.by = rep(editor, N.reverteds),
                                  reverted.by.bot = rep(is.bot, N.reverteds),
                                  reverted.by.admin = rep(is.admin, N.reverteds),
                                  revert.date.time = rep(date.time, N.reverteds),
                                  revert.id = rep(revid, N.reverteds))]

    reverted.edits <- ns.edits[reverted == TRUE]

    ## The i. prefix names the lookup table's columns explicitly; the
    ## unprefixed form only worked while the target lacked those columns.
    reverted.edits[reverted.lookup,
                   ":="(reverted.by = i.reverted.by,
                        reverted.by.bot = i.reverted.by.bot,
                        reverted.by.admin = i.reverted.by.admin,
                        revert.date.time = i.revert.date.time,
                        revert.id = i.revert.id),
                   on = .(wiki.name, revid)]

    reverted.edits[, message.window.end := revert.date.time + discussion.window]

    ## merge back revert info to all.edits
    all.edits[reverted.edits,
              ":="(reverted.by = i.reverted.by,
                   reverted.by.bot = i.reverted.by.bot,
                   reverted.by.admin = i.reverted.by.admin,
                   revert.date.time = i.revert.date.time,
                   revert.id = i.revert.id,
                   message.window.end = i.message.window.end),
              on = .(wiki.name, revid)]

    print("    done")
    print("    identifying editor talk page edits")

    ## we want talkers who talk before the end of the window
    talk.page.edits <- all.edits[namespace == 1]
    talk.page.edits[, talk := title]

    ## we only need to keep the key identifier for each revert
    ## use editor + title instead of revid since editors may have more than
    ## one edit reverted by a given revert.id.
    setkeyv(reverted.edits, c("wiki.name", "editor", "title", "revert.id"))

    ## condition where editor discusses after being reverted: same editor,
    ## matching talk page, after the revert and inside the message window
    editor.talks <- reverted.edits[talk.page.edits,
                                   .(
                                       wiki.name,
                                       editor = x.editor,
                                       revert.id = x.revert.id,
                                       talk.id = i.revid,
                                       talk.date.time = i.date.time
                                   ),
                                   on = .(editor,
                                          wiki.name,
                                          talk,
                                          revert.date.time < date.time,
                                          message.window.end >= date.time),
                                   nomatch = 0L]

    ## keep the earliest talk edit per revert
    editor.talks <- editor.talks[,
                                 .(
                                     editor.talks = TRUE,
                                     time.editor.talks = min(talk.date.time),
                                     editor.talks.revid = min(talk.id)
                                 ),
                                 by = .(wiki.name, editor, revert.id)]

    ## merge back to all.edits
    all.edits[editor.talks,
              ":="(editor.talks = i.editor.talks,
                   time.editor.talks = i.time.editor.talks,
                   editor.talks.revid = i.editor.talks.revid),
              on = .(wiki.name, editor, revert.id)]

    ## tidy up
    rm(editor.talks, reverted.lookup)

    print("    done")
    print("    identifying reverter talk page edits")
    ## NOTE(review): response.window.end is NA wherever the reverted editor
    ## did not talk, so the join below presumably cannot match reverts where
    ## only the reverter talked - confirm this is intended.
    all.edits[, ":="(response.window.end = time.editor.talks + discussion.window)]
    all.edits[(reverted == TRUE & is.na(editor.talks)), editor.talks := FALSE]
    reverted.edits <- all.edits[namespace == 0][reverted == TRUE]
    talk.page.edits <- all.edits[namespace == 1]
    ## rename so the talk edit's author joins against reverted.by
    talk.page.edits[, ":="(talk = title, reverted.by = editor)]

    reverter.talks <- reverted.edits[talk.page.edits,
                                     .(
                                         wiki.name = wiki.name,
                                         editor = x.editor,
                                         revert.id = x.revert.id,
                                         revert.date.time = x.revert.date.time,
                                         time.reverter.talks = i.date.time,
                                         reverter.talk.id = i.revid
                                     ),
                                     on = .(reverted.by,
                                            wiki.name,
                                            talk,
                                            revert.date.time < date.time,
                                            response.window.end >= date.time),
                                     nomatch = 0L]

    reverter.talks <- reverter.talks[time.reverter.talks > revert.date.time,
                                     .(
                                         reverter.talks = TRUE,
                                         time.reverter.talks = min(time.reverter.talks),
                                         reverter.talk.id = min(reverter.talk.id)
                                     ),
                                     by = .(wiki.name, editor, revert.id)]

    ## merge back to all.edits
    all.edits[reverter.talks,
              ":="(reverter.talks = i.reverter.talks,
                   time.reverter.talks = i.time.reverter.talks,
                   reverter.talk.id = i.reverter.talk.id),
              on = .(wiki.name, editor, revert.id)]

    ## tidy up
    rm(reverter.talks, talk.page.edits)

    all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]

    ## if the editor didn't talk first, the time window is different
    all.edits[reverter.talks == TRUE,
              editor.talks.first := (time.editor.talks < time.reverter.talks)]

    all.edits[(reverter.talks == TRUE) & (editor.talks.first == FALSE),
              reverter.talks := time.reverter.talks < (date.time + discussion.window)]

    print("    done")
    print("    identifying User talk page edits")

    ## now do the same thing but for user talk pages
    ## did the reverter post on the editor's user talk page?
    user.talk.edits <- all.edits[namespace == 3]
    user.talk.edits[, ":="(reverted.by = editor, user.talk = title)]
    reverted.edits <- all.edits[namespace == 0][reverted == TRUE]
    reverter.messages <- reverted.edits[user.talk.edits,
                                        .(wiki.name = x.wiki.name,
                                          title = x.title,
                                          revert.id = x.revert.id,
                                          editor = x.editor,
                                          reverted.by = i.reverted.by,
                                          time.reverter.messages = i.date.time,
                                          reverter.messages.id = i.revid),
                                        on = .(wiki.name,
                                               reverted.by,
                                               user.talk,
                                               revert.date.time <= date.time,
                                               message.window.end >= date.time),
                                        nomatch = 0L]

    ## keep the earliest message per revert
    reverter.messages <- reverter.messages[,
                                           .(reverter.messages = TRUE,
                                             time.reverter.messages = min(time.reverter.messages),
                                             reverter.message.id = min(reverter.messages.id)),
                                           by = .(wiki.name, editor, reverted.by, revert.id)]

    reverted.edits[reverter.messages,
                   ":="(reverter.messages = i.reverter.messages,
                        time.reverter.messages = i.time.reverter.messages,
                        reverter.message.id = i.reverter.message.id),
                   on = .(wiki.name, editor, revert.id)]

    reverted.edits[is.na(reverter.messages), reverter.messages := FALSE]

    all.edits[reverted.edits,
              ":="(reverter.messages = i.reverter.messages,
                   time.reverter.messages = i.time.reverter.messages,
                   reverter.message.id = i.reverter.message.id),
              on = .(wiki.name, editor, revert.id)]

    ## set some wiki-level variables
    print("    creating wiki windows")
    ## chronological order within each wiki; the grouped updates below do
    ## not reorder rows, so one sort is enough for the cumulative sums
    setorderv(all.edits, cols = c("wiki.name", "date.time", "articleid"), order = 1L)
    all.edits[,
              ":="(chars.change = diff(c(0L, text.chars), lag = 1, differences = 1),
                   creates.article = (date.time == min(date.time))),
              by = .(wiki.name, articleid)]

    all.edits[, ":="(wiki.birth.date = min(date.time)), by = .(wiki.name)]

    all.edits[,
              ":="(total.wiki.length = cumsum(chars.change),
                   n.articles = cumsum(creates.article),
                   wiki.age = as.duration(date.time - wiki.birth.date),
                   year = year(date.time)),
              by = .(wiki.name)]

    all.edits[, ":="(wiki.age.months = floor(as.double(wiki.age, units = 'days') / 30),
                    wiki.age.years = floor(as.double(wiki.age, units = 'years')))]

    ## generate breaks week.length apart, starting at the day of the wiki's
    ## first edit
    date.range <- all.edits[, .(first.edit = min(date.time), last.edit = max(date.time)),
                            by = .(wiki.name)]

    window.breaks <- date.range[,
                                .(breaks = seq(trunc(first.edit, "days"),
                                               trunc(last.edit, "days"),
                                               by = week.length),
                                  break.next = seq(trunc(first.edit + week.length, "days"),
                                                   trunc(last.edit + week.length, "days"),
                                                   by = week.length)),
                                by = .(wiki.name)]

    window.breaks[, week.index := seq_along(breaks), by = .(wiki.name)]

    all.edits[window.breaks,
              ":="(week = i.week.index),
              on = .(wiki.name, date.time <= break.next, date.time >= breaks)]

    print("   done")
    ## tidy up: drop working columns, skipping any that are not present
    drop.cols <- intersect(c("reverted.edits",
                             "N.reverteds",
                             "user",
                             "user.talk",
                             "talk",
                             "message.window.end",
                             "response.window.end"),
                           names(all.edits))
    if (length(drop.cols) > 0) {
        all.edits[, (drop.cols) := NULL]
    }

    print("    done")
    rm(reverted.edits, reverter.messages, user.talk.edits)
    return(all.edits)
}
373
## Aggregate the early-session, namespace 0 edits of every editor into one
## row per (wiki.name, editor) describing the newcomer's first experience.
##
## Args:
##   all.edits: edit table as returned by identify.revert.messages(); it is
##     re-keyed on date.time and gains time.last.edit.to.wiki,
##     time.till.page.edit and last.edit.to.page by reference.
##   newcomer.period: length of the newcomer period; also the lower age
##     bound for `survives`.
##   newcomer.sunset: upper age bound for `survives`.
##
## Returns:
##   The newcomers table without bots and admins, restricted to
##   wiki.type == "wikia" and to editors whose first edit is more than 60
##   days before the last wikia edit.
build.newcomers <- function(all.edits,
                            newcomer.period = duration(60,units="days"),
                            newcomer.sunset= duration(30*6,units="days")
                            ){
    setkeyv(all.edits, 'date.time')

    all.edits[, ":="(time.last.edit.to.wiki = max(date.time)), by = .(wiki.name)]

    ## Time until the page is edited again, in days. The conversion happens
    ## inside each group because diff() on date-times picks its units
    ## (secs/mins/hours/days) per call; combining raw per-group results
    ## mixed units in one column.
    all.edits[,
              time.till.page.edit := c(as.numeric(diff(date.time), units = "days"), NA_real_),
              by = .(wiki.name, articleid)]
    all.edits[, last.edit.to.page := is.na(time.till.page.edit)]

    ## for the last edit to a page use the time until the wiki's last edit
    all.edits[last.edit.to.page == TRUE,
              time.till.page.edit := as.numeric(difftime(time.last.edit.to.wiki,
                                                         date.time,
                                                         units = "days"))]

    all.edits[, time.till.page.edit := log1p(time.till.page.edit)]

    editor.variables <- all.edits[,
                                  .(survives = any((age > newcomer.period) & (age < newcomer.sunset)),
                                    anon = first(anon),
                                    is.bot = any(is.bot),
                                    is.admin = any(is.admin)),
                                  by = .(wiki.name, editor)]

    first.session.edits <- all.edits[in.early.session == TRUE]
    first.session.edits[, ":="(end.newcomer.period = time.first.edit + newcomer.period)]

    print("    aggregating newcomer activity within wikis")
    ## NOTE(review): rows are pre-filtered to namespace == 0, so ns1.edits
    ## and ns4.edits below are always 0 and ns0.edits equals newcomer.edits.
    newcomers <- first.session.edits[namespace == 0,
                                     .(
                                         is.reverted = any(reverted & reverted.by != editor),
                                         p.reverted = first(p.reverted),
                                         is.bot.reverted = any(reverted.by.bot),
                                         is.admin.reverted = any(reverted.by.admin),
                                         is.reverted.messaged = any(reverter.messages |
                                                                    reverter.talks, na.rm = TRUE),
                                         reverter.talks = any(reverter.talks, na.rm = TRUE),
                                         reverter.messages = any(reverter.messages, na.rm = TRUE),
                                         editor.talks = any(editor.talks, na.rm = TRUE),
                                         time.next.page.edit = min(time.till.next.edit, na.rm = TRUE),
                                         BRD.initiation = any(editor.talks &
                                                              (editor.talks.first |
                                                               !reverter.talks), na.rm = TRUE),
                                         BRD.reciprocation = any(editor.talks &
                                                                 editor.talks.first &
                                                                 reverter.talks, na.rm = TRUE),
                                         reverter.initates.BRD = any(reverter.talks &
                                                                     (!editor.talks.first |
                                                                      is.na(editor.talks.first)),
                                                                     na.rm = TRUE),
                                         time.first.edit = first(time.first.edit),
                                         time.till.page.edit = min(time.till.page.edit),
                                         last.edit.to.page = all(last.edit.to.page),
                                         end.newcomer.period = first(end.newcomer.period),
                                         week = first(week),
                                         year = first(year(time.first.edit)),
                                         newcomer.edits = .N,
                                         session.edits = first(n.first.session),
                                         ns0.edits = sum(namespace == 0),
                                         ns1.edits = sum(namespace == 1),
                                         ns4.edits = sum(namespace == 4),
                                         newcomer.chars.change = sum(chars.change),
                                         newcomer.creates.article = any(creates.article),
                                         wiki.type = first(wiki.type),
                                         wiki.age = first(wiki.age)
                                     ),
                                     by = .(wiki.name, editor)]

    newcomers[editor.variables,
              ":="(survives = survives, is.bot = is.bot, is.admin = is.admin),
              on = .(wiki.name, editor)]

    newcomers <- newcomers[!is.bot & !is.admin]
    print("    done")
    print("    identifying newcomer activity on other wikis")

    ## early-session edits by the same editor name on wikis of the same
    ## type whose first edit predates the first edit on this wiki
    newcomer.prior.wikis <- first.session.edits[newcomers,
                                                .(
                                                    editor = editor,
                                                    wiki.name = i.wiki.name,
                                                    other.wiki = x.wiki.name,
                                                    time.first.edit.this = i.time.first.edit,
                                                    time.first.edit.other = x.time.first.edit
                                                ),
                                                on = .(wiki.type, editor, time.first.edit < time.first.edit),
                                                nomatch = 0L,
                                                allow.cartesian = TRUE]

    ## using < time first edit should exclude edits to this wiki
    newcomer.prior.wikis <- newcomer.prior.wikis[, .(n.edits.other = .N),
                                                 by = .(editor, wiki.name, other.wiki)]

    newcomer.prior.wikis <- newcomer.prior.wikis[,
                                                 .(n.other.wikis = .N,
                                                   n.edits.other = sum(n.edits.other)),
                                                 by = .(wiki.name, editor)]

    newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
                                                 .(
                                                     wiki.name = wiki.name,
                                                     editor = editor,
                                                     n.other.wikis = n.other.wikis,
                                                     n.edits.other = n.edits.other,
                                                     has.edited.other.wikis = (n.other.wikis > 0) &
                                                         (!is.na(n.other.wikis))),
                                                 on = .(wiki.name, editor),
                                                 nomatch = NA]

    newcomers <- newcomers[newcomer.prior.wikis,
                           ":="(n.other.wikis = ifelse(is.na(i.n.other.wikis), 0, i.n.other.wikis),
                                n.edits.other = ifelse(is.na(i.n.edits.other), 0, i.n.edits.other),
                                has.edited.other.wikis = (i.n.other.wikis > 0) &
                                    (!is.na(i.n.other.wikis))),
                           on = .(wiki.name, editor)]

    newcomers[, ":="(has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis),
                                                    FALSE,
                                                    has.edited.other.wikis),
                    n.edits.other = ifelse(is.na(n.edits.other), 0, n.edits.other),
                    n.other.wikis = ifelse(is.na(n.other.wikis), 0, n.other.wikis))]

    print("    done")
    print("    identifying all messages")

    user.talk.edits <- all.edits[namespace == 3]
    user.talk.edits[, user.talk := title]

    newcomers[, user.talk := as.factor(paste0("User talk:", as.character(editor)))]

    ## edits to the newcomer's user talk page up to the end of the
    ## newcomer period, counted per newcomer
    newcomer.messages <- user.talk.edits[newcomers,
                                         .(
                                             editor = i.editor,
                                             n.messages = .N,
                                             end.newcomer.period = i.end.newcomer.period
                                         ),
                                         on = .(wiki.name, user.talk, date.time <= end.newcomer.period),
                                         by = .EACHI,
                                         nomatch = 0L]

    newcomer.messages <- newcomer.messages[newcomers,
                                           .(wiki.name,
                                             editor,
                                             n.messages = x.n.messages,
                                             is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
                                           on = .(wiki.name, editor),
                                           nomatch = NA]

    newcomers <- newcomers[newcomer.messages,
                           ":="(n.messages = ifelse(is.na(i.n.messages), 0L, i.n.messages),
                                is.messaged = ifelse(is.na(i.n.messages), FALSE, i.is.messaged)),
                           on = .(wiki.name, editor)]

    ## keep editors whose first edit is more than 60 days before the last
    ## edit in the data, then keep wikia wikis only
    last.edit <- max(all.edits$date.time)
    last.wikia.edit <- max(all.edits[wiki.type == "wikia", date.time])
    newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60, units = "days")]
    newcomers <- newcomers[(wiki.type == "wikia") &
                           (time.first.edit < (last.wikia.edit - as.difftime(60, units = "days")))]

    print("    done")
    return(newcomers)
}
530
531
## Select the namespace 4 edits made by non-anonymous editors.
##
## Args:
##   all.edits: edit table with namespace and anon columns.
##   week.length: accepted but not used by this function.
##
## Returns:
##   The rows of all.edits with namespace == 4 and anon FALSE.
build.namespace4.dataset <- function(all.edits,  week.length = as.difftime(7,units="days")){
    all.edits[namespace == 4 & !anon]
}
537     
538
## Summarise all.edits into one row per (wiki.name, week) with editor
## counts, wiki length, namespace 4 activity and namespace 0 revert rates.
##
## Args:
##   all.edits: edit table as returned by identify.revert.messages().
##   week.length: accepted but not used by this function.
##
## Returns:
##   wiki.data, one row per wiki and week. Weeks without namespace 0 or
##   namespace 4 edits keep NA in the corresponding columns.
build.wiki.level.variables <- function(all.edits, week.length = as.difftime(7,units="days")){

    ## overall activity per wiki-week
    wiki.data <- all.edits[,
                           list(n.editors = uniqueN(editor),
                                total.wiki.length = last(total.wiki.length)),
                           by = c("wiki.name", "week")]

    ## project namespace (4) activity per wiki-week
    wiki.ns4.data <- all.edits[namespace == 4,
                               list(n.ns4.edits = .N,
                                    n.ns4.editors = uniqueN(editor),
                                    d.ns4.length = sum(chars.change),
                                    ns4.editor.age = mean(age)),
                               by = c("wiki.name", "week")]

    ## article namespace (0) revert statistics per wiki-week
    wiki.ns0.data <- all.edits[namespace == 0,
                               list(revert.rate = mean(reverted, na.rm = TRUE),
                                    newcomer.revert.rate =
                                        sum((reverted & is.newcomer), na.rm = TRUE) /
                                        sum(is.newcomer, na.rm = TRUE),
                                    revert.disc.rate =
                                        sum((reverted & reverter.talks), na.rm = TRUE) /
                                        sum(reverted, na.rm = TRUE),
                                    newcomer.revert.disc.rate =
                                        sum((reverted & reverter.talks & is.newcomer), na.rm = TRUE) /
                                        sum(reverted & is.newcomer, na.rm = TRUE),
                                    revert.message.rate =
                                        sum((reverted & reverter.messages), na.rm = TRUE) /
                                        sum(reverted, na.rm = TRUE),
                                    newcomer.revert.message.rate =
                                        sum((reverted & reverter.messages & is.newcomer), na.rm = TRUE) /
                                        sum((reverted & is.newcomer), na.rm = TRUE),
                                    newcomer.edits.rate = mean(is.newcomer, na.rm = TRUE),
                                    bot.revert.rate = mean(reverted.by.bot, na.rm = TRUE),
                                    bot.revert.prop =
                                        sum(reverted.by.bot, na.rm = TRUE) /
                                        sum(reverted, na.rm = TRUE),
                                    newcomer.bot.revert.rate =
                                        mean((reverted.by.bot & is.newcomer), na.rm = TRUE),
                                    newcomer.bot.revert.prop =
                                        sum((reverted.by.bot & is.newcomer), na.rm = TRUE) /
                                        sum((reverted & is.newcomer), na.rm = TRUE),
                                    admin.revert.rate = mean(reverted.by.admin, na.rm = TRUE),
                                    admin.revert.prop =
                                        sum(reverted.by.admin, na.rm = TRUE) /
                                        sum(reverted, na.rm = TRUE),
                                    year = year(first(date.time)),
                                    month = month(first(date.time))),
                               by = c("wiki.name", "week")]

    ## replace NAs with 0 in every rate except revert.rate and
    ## newcomer.revert.rate, which are left as computed
    zero.filled <- c("revert.disc.rate",
                     "newcomer.revert.disc.rate",
                     "revert.message.rate",
                     "newcomer.revert.message.rate",
                     "newcomer.edits.rate",
                     "bot.revert.rate",
                     "bot.revert.prop",
                     "newcomer.bot.revert.rate",
                     "newcomer.bot.revert.prop",
                     "admin.revert.rate",
                     "admin.revert.prop")
    wiki.ns0.data[,
                  (zero.filled) := lapply(.SD, function(v) ifelse(is.na(v), 0, v)),
                  .SDcols = zero.filled]

    ## bring it together
    wiki.data[wiki.ns0.data,
              `:=`(revert.rate = i.revert.rate,
                   revert.disc.rate = i.revert.disc.rate,
                   newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
                   revert.message.rate = i.revert.message.rate,
                   newcomer.revert.message.rate = i.newcomer.revert.message.rate,
                   newcomer.edits.rate = i.newcomer.edits.rate,
                   bot.revert.rate = i.bot.revert.rate,
                   bot.revert.prop = i.bot.revert.prop,
                   newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
                   newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
                   admin.revert.rate = i.admin.revert.rate,
                   admin.revert.prop = i.admin.revert.prop),
              on = c("wiki.name", "week")]

    wiki.data[wiki.ns4.data,
              `:=`(n.ns4.edits = i.n.ns4.edits,
                   n.ns4.editors = i.n.ns4.editors,
                   d.ns4.length = i.d.ns4.length,
                   ns4.editor.age = i.ns4.editor.age),
              on = c("wiki.name", "week")]

    return(wiki.data)
}
618
619
## Ensure the global `all.edits` table is available.
##
## Does nothing when an object named `all.edits` is already visible.
## Otherwise the table is read from "all.edits.RDS" when that file exists, or
## rebuilt from the wikiq data (step-1 table, then user roles, then reverts
## and messages) and written to that file unless `nosave` is set. Afterwards
## the earliest and latest `date.time` values are passed to remember() and the
## table is assigned to the global `all.edits`.
##
## Reads the globals wiki.list, newcomer.period, bots, admins and nosave.
load.all.edits <- function(){
    ## nothing to do when the table is already in scope
    if(exists("all.edits")){
        return(invisible(NULL))
    }

    cache.path <- "all.edits.RDS"
    if(file.exists(cache.path)){
        print("loading wikiq data with reverts and messages")
        edits <- readRDS(cache.path)
        print("done")
    } else {
        print("loading wikiq data")
        edits <- build.newcomer.table.step1(wiki.list, newcomer.period = newcomer.period)
        print("done")

        print("adding user role data")
        edits <- add.userroles(edits, bots=bots, admins=admins)
        print("done")

        print("identifying reverts and messages")
        edits <- identify.revert.messages(edits, week.length=as.difftime(7,units="days"))
        print("done")

        if(!nosave){
            print("saving work")
            saveRDS(edits, cache.path)
            print("done")
        }
    }

    remember(min(edits$date.time),"earliest.data.point")
    remember(max(edits$date.time),"latest.data.point")

    ## publish the table as a global variable
    all.edits <<- edits
}
655
## Analysis windows used by the table-building steps below:
## a 60-day newcomer period, a 180-day newcomer sunset and a 7-day week.
## `units` is spelled out in full; the previous `unit=` only reached
## duration()'s `units` parameter through partial argument matching.
newcomer.period <- duration(2*30, units="days")
newcomer.sunset <- duration(30*6, units="days")
week.length <- duration(7, units="days")

## hand the windows to the project's remember() helper
remember(newcomer.period)
remember(newcomer.sunset)
remember(week.length)
662
## Make sure `newcomers` is available: reuse an object already in the
## session, else read "newcomers.RDS", else build the table from all.edits
## and write it to that file unless `nosave` is set.
if(!exists("newcomers")){
    file.name2 <- "newcomers.RDS"
    if(!file.exists(file.name2)){
        print("building newcomers table")
        load.all.edits()

        newcomers <- build.newcomers(
            all.edits,
            newcomer.sunset = newcomer.sunset,
            newcomer.period = newcomer.period
        )

        print("done")
        print("saving work")
        if(!nosave){
            saveRDS(newcomers, file.name2)
        }
    } else {
        newcomers <- readRDS(file.name2)
    }
}
684
685
## Make sure `ns4.reg.edits` is available: reuse an object already in the
## session, else read "ns4.reg.edits.RDS", else derive the namespace-4 table
## from all.edits and write it to that file unless `nosave` is set.
if(!exists("ns4.reg.edits")){
    file.name <- "ns4.reg.edits.RDS"
    if(!file.exists(file.name)){
        print("building ns4 edits table")

        ## the namespace 4 table is built from the full edit table
        load.all.edits()
        ns4.reg.edits <- build.namespace4.dataset(all.edits)
        print("done")
        print("saving work")
        if(!nosave){
            saveRDS(ns4.reg.edits, file.name)
        }
    } else {
        ns4.reg.edits <- readRDS(file.name)
    }
}
703
## Make sure the wiki-week table `wiki.data` is available: reuse an object
## already in the session, else read "wikiweeks.RDS", else build it from
## all.edits and write it to that file unless `nosave` is set.
if(!exists("wiki.data")){
    file.name3 <- "wikiweeks.RDS"
    if(file.exists(file.name3)){
        wiki.data <- readRDS(file.name3)
    } else {
        print("building wiki level variable")
        load.all.edits()
        wiki.data <- build.wiki.level.variables(all.edits, week.length = week.length)
        print("done")
        print("saving work")
        if(!nosave){
            saveRDS(wiki.data, file.name3)
        }
        print("done")
    }
}
721
722 #wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
723 #remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
724 #newcomers <- newcomers[!(wiki.name  %in% wikis.to.remove)]
725 #all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]
## Per-wiki summary statistics, cached in "wiki.stats.RDS".
## Reuses `wiki.stats` when it is already in the session, reads the cache
## file when it exists, and otherwise computes the table from all.edits and
## newcomers, passes it to remember() and writes the cache file.
if(!exists("wiki.stats")){
    file.name <- "wiki.stats.RDS"
    if(!file.exists(file.name)){
        load.all.edits()

        ## one tenure value per (wiki, editor): the first editor.tenure in the group
        editor.tenures <- all.edits[,.(tenure=first(editor.tenure)),by=.(wiki.name,editor)]

        wiki.stats <- all.edits[,.(total.editors = length(unique(editor)),
                                   total.edits = .N,
                                   total.reverts = sum(reverted),
                                   total.bot.reverts = sum(reverted.by.bot,na.rm=TRUE),
                                   ## count the matching rows directly instead of
                                   ## building a per-group subset of .SD just to
                                   ## take its nrow(); NA comparisons are skipped
                                   ## either way
                                   total.ns4.edits = sum(namespace==4,na.rm=TRUE),
                                   med.edit.tenure = median(editor.tenure)
                                   ),by=.(wiki.name)]

        med.editor.tenure <- editor.tenures[,.(med.editor.tenure=median(tenure)),by=.(wiki.name)]

        ## the i. prefix makes explicit that the joined table's column is
        ## meant, not the data.table object that shares its name
        wiki.stats[med.editor.tenure,med.tenure := i.med.editor.tenure,on="wiki.name"]

        newcomer.stats <- newcomers[,.(retention.rate = mean(survives),
                                       reverted.newcomers = sum(is.reverted)
                                       ),by=.(wiki.name)]
        wiki.stats <- wiki.stats[newcomer.stats,
                                 ':='(retention.rate = i.retention.rate,
                                      reverted.newcomers = i.reverted.newcomers),
                                 on="wiki.name"]
        remember(wiki.stats,silent=TRUE)
        ## NOTE(review): unlike the other caches in this file, this save is
        ## not guarded by `nosave` -- confirm whether that is intended
        saveRDS(wiki.stats,file.name)
    } else {
        wiki.stats <- readRDS(file.name)
    }
}
753
## Long-format copy of wiki.stats, one row per (wiki, variable), with
## row/col layout positions attached to each variable.
row1 <- c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits")
row2 <- c("med.editor.tenure","retention.rate")

## id.vars is spelled out (the old `id=` relied on partial argument matching)
## and the measure variables reuse row1 instead of repeating the same vector
m.wiki.stats <- melt(wiki.stats, id.vars="wiki.name", measure.vars=row1)

## col is the position of the variable within its row
m.wiki.stats[variable %in% row1, ":="(row = 1,col=which(row1 == variable,useNames=FALSE)),by=variable]
## none of row2's names are among the melted measure variables, so this
## statement matches no rows as written
m.wiki.stats[variable %in% row2, ":="(row = 2,col=which(row2 == variable,useNames=FALSE)),by=variable]

## drop zero-valued bot-revert rows, then replace every remaining zero with 1
m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]
762
## Map wiki.stats variable names to human-readable labels.
##
## varname: character vector or factor of variable names.
## Returns a character vector of labels, named by the input strings.
## Names without a label map to NA, and the result is a character vector
## even for empty input (the sapply()/switch() version returned a list in
## both of those cases).
friendly.var <- function(varname){
    labels <- c(total.editors = "Editors",
                total.reverts = "Reverts",
                total.bot.reverts = "Bot reverts",
                total.ns4.edits = "Edits to the project namespace")
    keys <- as.character(varname)
    ## look the labels up by name, then name the result by the input strings
    ## exactly as sapply() did
    setNames(unname(labels[keys]), keys)
}
770
## Map wiki.stats variable names to numeric ids (1-4).
##
## varname: character vector or factor of variable names.
## Returns a numeric vector of ids, named by the input strings. Names without
## an id map to NA, and the result is a numeric vector even for empty input.
##
## The function's closing brace was missing, which left the definition open
## through the end of the file; it is restored here.
var.id <- function(varname){
    ids <- c(total.editors = 1,
             total.reverts = 2,
             total.bot.reverts = 3,
             total.ns4.edits = 4)
    keys <- as.character(varname)
    ## look the ids up by name, then name the result by the input strings
    setNames(unname(ids[keys]), keys)
}
777
778
med.line.width <- 1

## swap the raw variable names for their display labels, then turn the
## labels into a factor with a fixed level order
m.wiki.stats[, `:=`(variable = friendly.var(variable))]
m.wiki.stats <- m.wiki.stats[, `:=`(variable = factor(variable,
                                                      levels = c('Editors',
                                                                 "Reverts",
                                                                 "Bot reverts",
                                                                 "Edits to the project namespace")))]

## median value of each variable
spoke.data <- m.wiki.stats[, list(y = median(value)), by = "variable"]

remember(m.wiki.stats)
remember(spoke.data)
remember(nrow(wiki.stats),"n.wikia.wikis")
787
## Attach the wiki-week variables to the newcomer table so that newcomer
## retention can be modelled. This is an update join on (wiki.name, week):
## the i.-prefixed columns are copied from wiki.data, and the age buckets and
## quarter are derived from wiki.age and time.first.edit.
## NOTE(review): wiki.age and time.first.edit are referenced without a
## prefix -- confirm which of the two tables supplies them.
newcomers <- newcomers[wiki.data,
                       `:=`(
                           wiki.name = i.wiki.name,
                           week = i.week,
                           n.editors = i.n.editors,
                           total.wiki.length = i.total.wiki.length,
                           revert.rate = i.revert.rate,
                           revert.disc.rate = i.revert.disc.rate,
                           newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
                           revert.message.rate = i.revert.message.rate,
                           newcomer.revert.message.rate = i.newcomer.revert.message.rate,
                           newcomer.edits.rate = i.newcomer.edits.rate,
                           bot.revert.rate = i.bot.revert.rate,
                           bot.revert.prop = i.bot.revert.prop,
                           newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
                           newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
                           admin.revert.rate = i.admin.revert.rate,
                           admin.revert.prop = i.admin.revert.prop,
                           n.ns4.edits = i.n.ns4.edits,
                           n.ns4.editors = i.n.ns4.editors,
                           d.ns4.length = i.d.ns4.length,
                           ns4.editor.age = i.ns4.editor.age,
                           ## wiki age expressed in several granularities
                           wiki.age.weeks = as.double(wiki.age, units = "days") / 7,
                           wiki.age.months = floor(as.double(wiki.age, units = "days") / 30),
                           wiki.age.half.years = floor(as.double(wiki.age, units = "years") * 2),
                           wiki.age.years = floor(as.double(wiki.age, units = "years")),
                           ## three-month period containing time.first.edit
                           quarter = factor(floor_date(time.first.edit, unit = "3 months"))
                       ),
                       on = c("wiki.name", "week")]
819
820
## Newcomer survival rate and newcomer count per (wiki, week).
## The grouping columns come from `by` alone; also listing them in j, as
## before, produced duplicate wiki.name/week columns in the result.
survival.data <- newcomers[,.(survival.rate = mean(survives),
                              n.newcomers = .N),
                           by = .(wiki.name, week)]

## Copy both measures onto the matching wiki-week rows. The i. prefix pins
## the values to survival.data: when wiki.data already carries these columns
## (e.g. the script is re-run in the same session), the unprefixed names
## would resolve to wiki.data's own columns and assign them to themselves.
wiki.data <- wiki.data[survival.data,
          ":="(
              survival.rate = i.survival.rate,
              n.newcomers = i.n.newcomers),
          on = .(wiki.name,week)]
831
## Edit counts per (wiki, editor, wiki.age.months), cached in
## "active.editors.RDS": read the file when it exists, otherwise compute the
## table from all.edits and write it.
file.name <- "active.editors.RDS"
if(file.exists(file.name)){
    active.editors <- readRDS(file.name)
} else {
    load.all.edits()
    ## N.edits counts the rows in each group; wiki.age.years keeps the first
    ## value seen in the group
    active.editors <- all.edits[,
                                list(N.edits = .N,
                                     wiki.age.years = first(wiki.age.years)),
                                by = c("wiki.name", "editor", "wiki.age.months")]
    saveRDS(active.editors, file.name)
}

Community Data Science Collective || Want to submit a patch?