]> code.communitydata.science - rises_declines_wikia_code.git/blob - 04_model_namespace4.R
add copy of the GPL
[rises_declines_wikia_code.git] / 04_model_namespace4.R
1 #!/usr/bin/env Rscript
2
3 # Fits models predicting reversions of namespace 4 edits
4 # Copyright (C) 2018  Nathan TeBlunthuis
5
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 # GNU General Public License for more details.
15
16 # You should have received a copy of the GNU General Public License
17 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18
19 library(effects)
20 library(texreg)
21 library(lme4)
# Build the shared datasets only when they are not already in the session;
# the `newcomers` object is used as the marker that the build script has run.
if (!exists("newcomers")) {
    source("01_build_datasets.R")
}

# Run-time switches.
# `nosave` is not read anywhere in this script; presumably the helpers loaded
# above consult it -- TODO confirm.
nosave <- FALSE
# TRUE: fit on a sampled set of namespace 4 edits; FALSE: use every edit.
sample <- FALSE

if (!sample) {
    # Default path: the full regression dataset of namespace 4 edits.
    ns4.ds <- ns4.reg.edits
} else {
    # Sampling helpers are only loaded when they are actually needed.
    source("lib-01-sample-datasets.R")
    ns4.ds <- sample.ns4.edits()
    # Only defined on this path; the models fitted below build their own
    # weights and do not use this vector.
    weights <- ns4.ds$weight
}
35
36
# Derive the model covariates in one `:=` call:
#   wiki.age.log / age.log : log1p of wiki age and editor age, in years
#   wiki.age / age         : the same ages as plain numeric years
#   quarter                : "<year>_<quarter>" factor of the edit timestamp
#
# NOTE(review): `as.double(x, units = "years")` assumes `age` and `wiki.age`
# arrive as a time-span class that accepts "years" as a unit (e.g. a lubridate
# Duration) -- confirm against 01_build_datasets.R.
#
# FIX: the quarter index was previously ceiling(month / 4), which yields only
# three four-month periods (1..3). Dividing by 3 maps months 1-3 -> 1,
# 4-6 -> 2, 7-9 -> 3 and 10-12 -> 4, i.e. real calendar quarters.
ns4.ds <- ns4.ds[, ":="(
    wiki.age.log = log1p(as.double(wiki.age, units = "years")),
    age.log = log1p(as.double(age, units = "years")),
    wiki.age = as.double(wiki.age, units = "years"),
    quarter = as.factor(paste0(year(date.time), "_",
                               ceiling(month(date.time) / 3))),
    age = as.double(age, units = "years")
)]

# Earliest `time.first.edit` seen for each editor across all of their rows.
ns4.ds <- ns4.ds[, ":="(time.first.wikia.edit = min(time.first.edit)),
                 by = .(editor)]

# Keep a handle on the unfiltered rows for the "all newcomers" model. No copy
# is made here; the filter on the next line returns a new table, so later
# changes to `ns4.ds` do not touch this one.
ns4.ds.all.newcomers <- ns4.ds

# Main analysis sample: only rows whose `time.first.edit` equals the editor's
# earliest one, i.e. edits made on the wiki where the editor first edited.
ns4.ds <- ns4.ds[time.first.wikia.edit == time.first.edit]
46
# Descriptive statistics of the analysis sample, collected in one named list.
# `age` and `wiki.age` are the numeric-years columns derived earlier in this
# script; `reverted` is presumably a logical / 0-1 indicator, which makes its
# mean the share of reverted edits -- TODO confirm.
ns4.summary.stats <- list(
    p.reverted = mean(ns4.ds$reverted),
    var.reverted = var(ns4.ds$reverted),
    mean.editor.age = mean(ns4.ds$age),
    var.editor.age = var(ns4.ds$age),
    median.editor.age = median(ns4.ds$age),
    mean.wiki.age = mean(ns4.ds$wiki.age),
    var.wiki.age = var(ns4.ds$wiki.age),
    median.wiki.age = median(ns4.ds$wiki.age)
)

# Hand the list to the project's `remember` helper (defined outside this
# file; presumably it records the value under the variable's name).
remember(ns4.summary.stats)
58
# Baseline (unweighted) logistic regression of `reverted` on editor age, wiki
# age, and the `quarter` and `wiki.name` terms, fitted on the analysis sample.
print('fit morgan model')
f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
morgan.model <- glm(
    f.morgan,
    data = ns4.ds,
    family = binomial(link = 'logit')
)

# Keep the full fitted object on disk, and pass the extract() summary of it to
# the project's `remember` helper under the name "morgan.model".
saveRDS(morgan.model, "morgan.model.RDS")
remember(extract(morgan.model), "morgan.model", silent = TRUE)
64
# Same specification as the baseline model, re-fitted with per-edit weights
# that give every wiki the same total weight.
print('fit morgan model weights')
f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")

# weight.per.wiki is the mean number of edits per wiki. Dividing it by a
# wiki's own edit count (.N within the group) makes each wiki's weights sum
# to weight.per.wiki, so the weights overall still sum to nrow(ns4.ds).
n.total.wikis <- length(unique(ns4.ds$wiki.name))
weight.per.wiki <- nrow(ns4.ds) / n.total.wikis
ns4.ds <- ns4.ds[, weights := weight.per.wiki / .N, by = .(wiki.name)]

morgan.model.weighted <- glm(
    f.morgan,
    data = ns4.ds,
    family = binomial(link = 'logit'),
    weights = ns4.ds$weights
)
saveRDS(morgan.model.weighted, "morgan.model.weighted.RDS")
remember(extract(morgan.model.weighted), "morgan.model.weighted", silent = TRUE)
74
# Second weighted fit: same specification and per-wiki weighting scheme as the
# weighted model, but restricted to wikis with at least `min.edits` edits in
# the analysis sample.
min.edits <- 10
# FIX: this stage used to print the same message as the previous weighted fit,
# which made the two stages indistinguishable in the run log.
print(paste0('fit morgan model weights (wikis with >= ', min.edits, ' edits)'))
f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")

# Per-wiki edit count, stored on every row.
ns4.ds <- ns4.ds[, N := .N, by = wiki.name]

# Keep a handle on the unfiltered table so it can be restored afterwards. The
# row filter below returns a new table, so this one is left untouched.
ns4.ds.temp <- ns4.ds

# Share of wikis that the minimum-edit filter removes; printed and remembered.
remember(
    print(1 - length(unique(ns4.ds[N >= min.edits]$wiki.name)) /
              length(unique(ns4.ds$wiki.name))),
    "p.wikis.removed.weighted2"
)

# Drop wikis with fewer than `min.edits` edits. The share of wikis removed is
# the value remembered just above (24.1% in the original author's run).
ns4.ds <- ns4.ds[N >= min.edits]

# Recompute the equal-total-weight-per-wiki weights on the filtered sample.
n.total.wikis <- length(unique(ns4.ds$wiki.name))
weight.per.wiki <- nrow(ns4.ds) / n.total.wikis
ns4.ds <- ns4.ds[, weights := weight.per.wiki / .N, by = .(wiki.name)]

morgan.model.weighted2 <- glm(
    f.morgan,
    data = ns4.ds,
    family = binomial(link = 'logit'),
    weights = ns4.ds$weights
)
saveRDS(morgan.model.weighted2, "morgan.model.weighted2.RDS")
remember(extract(morgan.model.weighted2), "morgan.model.weighted2", silent = TRUE)

# Restore the unfiltered analysis sample for the models fitted further down.
ns4.ds <- ns4.ds.temp
90
# Re-fit the unweighted specification on the rows kept aside before the
# first-wiki filter was applied (`ns4.ds.all.newcomers`). `f.morgan` is the
# formula most recently defined earlier in this script.
print('fit morgan model all newcomers')
morgan.model.all.newcomers <- glm(
    f.morgan,
    data = ns4.ds.all.newcomers,
    family = binomial(link = 'logit')
)
saveRDS(morgan.model.all.newcomers, "morgan.model.all.newcomers.RDS")
remember(
    extract(morgan.model.all.newcomers),
    "morgan.model.all.newcomers",
    silent = TRUE
)
95
print('fitting RE model')

# Logistic mixed model with a random intercept per wiki and no fixed effects
# (the "-1" removes the fixed intercept). It is used only to split the
# variance in `reverted` into a between-wiki part and a remainder.
re.icc.reverted.model <- glmer(
    as.formula("reverted ~ + (1 | wiki.name) -1 "),
    data = ns4.ds,
    family = binomial(link = logit)
)
saveRDS(re.icc.reverted.model, "re.icc.reverted.model.RDS")

# Variance components as a table; pull out the wiki-level intercept variance.
varcorrmat <- as.data.table(VarCorr(re.icc.reverted.model))
wiki.var <- varcorrmat[grp == 'wiki.name' & var1 == "(Intercept)", vcov]

# Remaining variance, taken as the variance of the model's residuals.
# NOTE(review): this is an empirical residual variance, not the fixed
# latent-scale logistic variance (pi^2 / 3) -- confirm this is the intended
# ICC definition.
group.var <- var(residuals(re.icc.reverted.model))

# Intraclass correlation: share of the total variance that lies between wikis.
icc <- wiki.var / (wiki.var + group.var)

remember(varcorrmat, 'icc.reverted.varcorrmat')
remember(group.var, 'icc.reverted.group.var')
remember(icc, 'icc.reverted')
107
108 ## print("fit morgan model sample")
109 ## sample.size <- 30
110 ## ns4.ds <- ns4.ds[,in.sample:=(.N >= sample.size),by=wiki.name]
111 ## # DT[,.SD[sample(.N, min(3,.N))],by = a]
112 ## ns4.ds.equal.sample <- ns4.ds[,.SD[sample(.N,min(sample.size,.N))], by=wiki.name]
113 ## morgan.model.sampled <- glm(f.morgan,data=ns4.ds.equal.sample,family=binomial(link='logit'))
114 ## saveRDS(morgan.model.sampled,"morgan.model.sampled.RDS")
115 ## remember(extract(morgan.model.sampled),"morgan.model.sampled",silent=TRUE)
116
117 ## ns4.model2.formula <- as.formula("reverted ~ age.log + wiki.age + quarter")
118 ## ns4.model2 <- glm(ns4.model2,data=ns4.ds,family=binomial(link='logit'),weights=weights)
119 ## remember(extract(ns4.model2),"ns4.model2")
120
121 ## print('fit morgan no pooling model')
122 ## f.morgan <- as.formula("reverted ~ age.log + wiki.age + quarter + wiki.name:age.log + wiki.name:wiki.age")
123 ## morgan.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=weights)
124 ## remember(extract(morgan.model),"morgan.model")
125
126 ## re.ns4.model <- glmer(as.formula("reverted ~ age.log + wiki.age + quarter | wiki.name"),data=ns4.ds,family=binomial(link='logit'),weights=weights)
127
128 ## remember(extract(re.ns4.model),'re.ns4.model')
129
130 ## print('fit morgan.robustness.1 model')
131 ## f.morgan.robustness.1 <- as.formula("reverted ~ age.log + wiki.age + quarter + wiki.name")
132 ## ns4.reg.edits.robustness <- build.namespace4.dataset(all.edits[p.reverted < 0.5])
133
134 ## ns4.reg.edits.robustness[,":="(wiki.age.log = log1p(as.double(wiki.age,units="weeks")),
135 ##                                age.log = log1p(as.double(age,units="weeks")),
136 ##                                wiki.age = as.double(wiki.age,units='weeks'),
137 ##                                quarter = as.factor(paste0(year(date.time),"_",ceiling(month(date.time)/4))))]
138
139 ## morgan.robustness.1.model <- glm(f.morgan.robustness.1,data=pns4.reg.edits.robustness,family=binomial(link='logit'),weights=weights)
140 ## saveRDS(morgan.robustness.1.model,"morgan.robustness.1.model.RDS")
141 ## remember(extract(morgan.robustness.1.model),"morgan.robustness.1.model")
142
143       
144 ## ns4.ds[,":="(wiki.age.log = log1p(as.numeric(wiki.age,units="weeks")), age.log = log1p(as.numeric(age,units="weeks")))]
145 ## f.ns4.2 <- as.formula("reverted ~ age.log + wiki.age.log + age.log|wiki.age.log + wiki.name")
146 ## ns4.2.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=weights)
147 ## remember(extract(ns4.2.model))
148
149 ## summary statistics for namespace 4 edits

Community Data Science Collective || Want to submit a patch?