1 # Library containing helper functions
2 # Copyright (C) 2018 Nathan TeBlunthuis
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
# Read one wikiq TSV file into a data.table and derive per-edit columns.
#
# path: path of the wikiq output file; passed to fread().
#
# Steps visible below: rename columns from snake_case to dot.case, parse
# date.time, URL-decode editor and title, filter edits by date, flag reverted
# revisions, add running counters, and keep only namespaces 0, 1, 3 and 4.
# fread, setnames, setcolorder, setkey, data.table and url_decode are used
# without a namespace prefix, so the caller must have them attached.
# NOTE(review): several statements of this function (closing parentheses,
# branch headers, the return value) are not visible in this listing -- confirm
# against the full file before changing any logic.
22 ## load wikiq data for all wikis in the wiki list
23 ## this wikiq data doesn't have persistent word revisions
24 ## It doesn't collapse user edits either. We identify user sessions as well
25 load.wikiq.file <- function(path){
# Read reverteds, date_time, editor and title as character; treat "" as NA;
# disable quote handling; skip the sha1 and minor columns.
# `na.string` relies on partial matching of fread's `na.strings` argument.
26 d <- fread(paste0(path),
27 colClasses=list(character=c("reverteds", "date_time", "editor", "title")),
28 na.string="", stringsAsFactors=TRUE, quote="",drop=c("sha1","minor"))
# Column names: every "_" becomes "." (date_time -> date.time, ...).
30 setnames(d, gsub('_', '.', colnames(d)))
# Parse timestamps of the form "YYYY-mm-dd HH:MM:SS".
# NOTE(review): the rest of this call is not visible here -- confirm which
# time zone is passed.
32 d$date.time <- as.POSIXct(as.character(d$date.time),
33 format="%Y-%m-%d %H:%M:%S",
# URL-decode editor and title in place and store them as factors again.
36 d[, ':='(editor = as.factor(url_decode(as.character(editor))), title = as.factor(url_decode(as.character(title))))]
# Edits attributed to 127.0.0.1 are marked as not anonymous.
# NOTE(review): presumably these are software-generated edits; confirm.
38 d[d$editor == "127.0.0.1","anon"] <- FALSE
40 # drop edits made before mediawiki was written
# NOTE(review): as.POSIXct's time-zone argument is `tz`; `timezone=` is not a
# named argument and appears to be silently ignored, so this cutoff is
# probably interpreted in the session's local time zone -- confirm.
41 d <- d[d$date.time > as.POSIXct("2002-01-22",timezone="UTC"),]
43 ## drop wikia edits made after 2010-04-10, when data was collected
# NOTE(review): wiki.list is not a parameter of this function, so it is looked
# up in the enclosing/global environment, and the comparison is made against
# the whole wiki.type column; with more than one row this is a length > 1
# `if` condition (an error in R >= 4.2). Same `timezone=` caveat as above.
44 if(wiki.list$wiki.type == "wikia"){
45 d <- d[d$date.time < as.POSIXct("2010-04-10",timezone="UTC"),]
48 # create "reverted", which captures whether an edit has been identity
49 # reverted within the revert RADIUS (currently 15 edits).
# NOTE(review): the branch conditions around the next statements are not
# visible in this listing.
52 ## we need to reorder the columns in this case
53 ## the merge in the other case also reorders columns
# Move revid to the first column position.
54 setcolorder(d,c("revid",names(d)[!grepl("revid",names(d))]))
# reverteds values of the rows where d$revert is TRUE.
57 reverteds <- d$reverteds[d$revert]
# No comma anywhere in the column: each entry is parsed as a single integer.
# NOTE(review): this branch parses the unfiltered d$reverteds column rather
# than the `reverteds` subset selected above -- confirm that is intended.
59 if (!any(grepl(",", d$reverteds))) {
60 reverteds <- unique(as.integer(as.character(d$reverteds)))
# Comma-separated lists are split into individual revision ids
# (presumably the else branch; its header is not visible here).
62 reverteds <- unique(as.integer(unlist(strsplit(as.character(reverteds), ","))))
# Lookup table of reverted revision ids, left-joined onto d (all.x=TRUE);
# rows without a match get reverted = FALSE.
65 reverteds <- data.table(revid=reverteds, reverted=TRUE)
66 d <- merge(d, reverteds, all.x=TRUE)
67 d$reverted[is.na(d$reverted)] <- FALSE
70 # "new.account" marks the first edit (in date.time order) of each editor
71 setkey(d, "date.time")
72 d$new.account <- !duplicated(d$editor)
# Rows with a missing editor never count as a new account.
73 d$new.account[is.na(d$editor)] <- FALSE
# total.edits holds the same value (the row count) on every row, whereas the
# counters below are running values. NOTE(review): confirm this is intended.
74 d$total.edits <- length(d$revid)
# Running row index in date.time order.
75 d$total.sessions <- seq(1, nrow(d))
# Running number of distinct editors / distinct articleid values seen so far.
76 d$total.editors <- cumsum(d$new.account)
77 d$total.pages <- cumsum(!duplicated(d$articleid))
# NOTE(review): the statement this comment refers to is not visible here.
79 ## add the wiki name to the dt
81 ## remove edits not in the namespaces we care about
# The counters above were computed before this filter, so they include edits
# in the namespaces dropped here.
82 d <- d[namespace %in% c(0,1,3,4),]
# Load the wikiq file belonging to row i of wiki.list and tag it with the
# wiki's name and type.
#
# i: row index into wiki.list.
# wiki.list: table with filename, wiki and wiki.type columns; the unquoted
#   column references in `[` assume it is a data.table.
# path: directory prefix; it is joined to the filename with paste0(), so it
#   must end with a path separator.
# NOTE(review): the return statement is not visible in this listing.
86 load.wikiq.files <- function(i,wiki.list, path="wikiq_wikia_2010_all_nopersistence/"){
87 wiki.filename = wiki.list[i,filename]
88 wiki <- wiki.list[i,wiki]
90 d <- load.wikiq.file(paste0(path,wiki.filename))
# Repeat the wiki's name and type on every row of the loaded data.
92 d$wiki.name <- rep(wiki,nrow(d))
93 d$wiki.type <- rep(wiki.list[i,wiki.type],nrow(d))
# Earliest date.time per (editor.id, wiki.name), added by reference.
94 d[,time.first.edit := min(date.time),by=.(editor.id, wiki.name)]
# Record a value in the global list `r` and persist `r` to remember.RDS.
# NOTE(review): most of this function's body (including the assignment into
# `r`) is not visible in this listing; the description follows the comments
# and statements that are.
#
# v: the value to remember.
# k: key to store it under; the visible code derives it from the expression
#   passed as v. The condition guarding that default is not visible.
# silent: not referenced in the visible lines -- TODO confirm its effect.
#
# Side effects visible here: `r` is assigned outside the function via `<<-`
# when remember.RDS exists, and remember.RDS is rewritten in the working
# directory.
99 remember <- function (v, k, silent=FALSE) {
101 rfilename = "remember.RDS"
# Reload previously remembered values from disk, replacing the current `r`.
102 if(file.exists(rfilename)){
104 r <<- readRDS(rfilename)
# Use the text of the expression passed as v (e.g. the variable name) as key.
111 k <- deparse(substitute(v))
114 ## save to the global r variable/list
# Same file as rfilename above, spelled as a literal here.
125 saveRDS(r,"remember.RDS")
## Make sure that appendix, nosave and plot.distributions are always defined.
## Each flag defaults to FALSE, but a value set before this file is sourced
## is kept.
if (!exists("appendix")) {
  appendix <- FALSE
}
if (!exists("nosave")) {
  nosave <- FALSE
}
## The guard must test the same name it assigns: the previous check looked for
## the misspelled "plot.distribtuions", which never exists, so a caller-supplied
## plot.distributions was always overwritten with FALSE.
if (!exists("plot.distributions")) {
  plot.distributions <- FALSE
}
# Build wiki.list (one row per wiki to analyse) unless it is already defined.
# Reads the deleted / not-authorized wiki lists from userroles_data/ and
# selected.wikis.csv from the working directory.
# NOTE(review): the closing braces and at least one branch header of this
# section are not visible in this listing.
# Wikipedia wikis are excluded unless this flag is switched on by hand.
134 include.wikipedia <- FALSE
135 if (!exists("wiki.list")) {
136 subdir <- "userroles_data/"
# NOTE(review): exists() looks up an R object by name, and the name tested
# here is "userroles_data/missing.wikis" -- neither the variable missing.wikis
# nor a file on disk. The branch therefore presumably always runs; confirm
# whether exists("missing.wikis") or file.exists(...) was intended.
137 if (!exists(paste0(subdir,"missing.wikis"))){
# One wiki name per line, no header; keep the distinct names.
138 deleted.wikis <- fread(paste0(subdir,"allusers_deleted_merge.txt"),header=FALSE,col.names=c("wiki"))
139 deleted.wikis <- unique(deleted.wikis$wiki)
141 notauthorized.wikis <- fread(paste0(subdir,"allusers_notauthorized_merge.txt"),header=FALSE,col.names=c("wiki"))
142 notauthorized.wikis <- unique(notauthorized.wikis$wiki)
143 missing.wikis = c(deleted.wikis, notauthorized.wikis)
# Persist both lists through remember().
144 remember(deleted.wikis)
145 remember(notauthorized.wikis)
# Candidate wikis, minus those listed as deleted or not authorized.
148 wiki.list <- fread("selected.wikis.csv")
149 wiki.list <- wiki.list[! (wiki %in% missing.wikis) ]
# Wikia data files are named "<wiki>.tsv".
150 wiki.list[wiki.type=="wikia",filename:=paste0(wiki,".tsv")]
152 if(include.wikipedia){
# Extract the language subdomain from https://<lang>.wikipedia.org URLs and
# replace "-" with "_" to build "<lang>_wikipedia.tsv".
153 matchidx <- wiki.list[wiki.type=="wikipedia",regexec("https://(.*)\\.wikipedia.org",url)]
154 lang <- sapply(regmatches(wiki.list[wiki.type=="wikipedia",url],matchidx),function (l) l[2])
155 lang <- gsub("-","_",lang)
# NOTE(review): inside `[`, `lang` on the right-hand side resolves to a column
# of wiki.list if one with that name already exists, otherwise to the vector
# computed above -- confirm which applies.
156 wiki.list[wiki.type=="wikipedia",lang := lang]
157 wiki.list[wiki.type=="wikipedia",filename:=paste0(lang,"_wikipedia.tsv")]
# Drop wikipedia rows (presumably the else branch of include.wikipedia; its
# header is not visible here).
160 wiki.list <- wiki.list[wiki.type != "wikipedia"]
163 # wiki.list[,lang := NULL]
# Record the names of the wikis in wiki.list in the file "wikis.used", but
# only if that file does not exist yet (an existing file is left untouched).
# NOTE(review): the closing brace of this `if` is not visible in this listing.
168 if (!file.exists("wikis.used")){
169 write(wiki.list$wiki,"wikis.used")
# Set the mc.cores option, which parallel::mclapply() consults for its default
# number of cores.
172 options(mc.cores = 16)