# Library containing helper functions
# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see https://www.gnu.org/licenses/.

library(parallel)
library(urltools)
library(data.table)
library(texreg)

## Load one wikiq TSV file into a data.table.
##
## This wikiq data doesn't have persistent word revisions and doesn't
## collapse user edits either.
##
## Arguments:
##   path       path of the wikiq TSV file, handed to fread().
##   wiki.type  type of the wiki being loaded; the value "wikia" enables the
##              2010-04-10 cutoff.  When NULL (the default) the type is taken
##              from the first row of the global `wiki.list`, which mirrors
##              what the previous implementation did on R versions that still
##              accepted a vector `if` condition.
##
## Returns a data.table keyed on date.time, restricted to namespaces
## 0, 1, 3 and 4, with the added columns reverted, new.account, total.edits,
## total.sessions, total.editors and total.pages.
load.wikiq.file <- function(path, wiki.type = NULL) {
    if (is.null(wiki.type)) {
        if (!exists("wiki.list")) {
            stop("wiki.type was not supplied and no global wiki.list exists",
                 call. = FALSE)
        }
        wiki.type <- wiki.list$wiki.type[1]
    }
    is.wikia <- isTRUE(as.character(wiki.type)[1] == "wikia")

    d <- fread(path,
               colClasses = list(character = c("reverteds", "date_time",
                                               "editor", "title")),
               na.strings = "",
               stringsAsFactors = TRUE,
               quote = "",
               drop = c("sha1", "minor"))
    gc()

    ## wikiq uses underscores in column names; the rest of the code uses dots
    setnames(d, gsub("_", ".", colnames(d), fixed = TRUE))
    setkey(d, "revid")

    d[, `:=`(
        date.time = as.POSIXct(as.character(date.time),
                               format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
        editor = as.factor(url_decode(as.character(editor))),
        title = as.factor(url_decode(as.character(title)))
    )]

    ## edits attributed to the loopback address are not counted as anonymous
    d[editor == "127.0.0.1", anon := FALSE]

    ## drop edits made before mediawiki was written
    ## (the argument is `tz`; `timezone` was silently ignored by as.POSIXct)
    mediawiki.start <- as.POSIXct("2002-01-22", tz = "UTC")
    d <- d[date.time > mediawiki.start]

    ## drop wikia edits made after 2010-04-10, when data was collected
    if (is.wikia) {
        collection.date <- as.POSIXct("2010-04-10", tz = "UTC")
        d <- d[date.time < collection.date]
    }

    ## create "reverted", which captures whether an edit has been identity
    ## reverted within the revert RADIUS (currently 15 edits).
    if (!any(d$revert, na.rm = TRUE)) {
        d$reverted <- FALSE
        ## we need to reorder the columns in this case
        ## the merge in the other case also reorders columns
        setcolorder(d, c("revid", setdiff(names(d), "revid")))
    } else {
        ## "reverteds" holds a comma separated list of revision ids;
        ## strsplit also handles the case without any commas
        reverted.ids <- as.character(d$reverteds[which(d$revert)])
        reverted.ids <- unlist(strsplit(reverted.ids, ",", fixed = TRUE))
        reverted.ids <- unique(as.integer(reverted.ids))
        reverted.ids <- reverted.ids[!is.na(reverted.ids)]

        reverteds <- data.table(revid = reverted.ids, reverted = TRUE)
        d <- merge(d, reverteds, by = "revid", all.x = TRUE)
        d[is.na(reverted), reverted := FALSE]
    }

    ## "new.account" indicates whether this is an editor's first edit
    setkey(d, "date.time")
    d[, new.account := !duplicated(editor) & !is.na(editor)]

    ## NOTE(review): total.edits is the constant row count while
    ## total.sessions is a running row index -- confirm the names are not
    ## swapped before relying on either.
    d[, total.edits := nrow(d)]
    d[, total.sessions := seq_len(nrow(d))]
    d[, total.editors := cumsum(new.account)]
    d[, total.pages := cumsum(!duplicated(articleid))]

    ## remove edits not in the namespaces we care about
    d <- d[namespace %in% c(0, 1, 3, 4)]
    d
}

## Load the wikiq file of the i-th wiki in wiki.list.
##
## Arguments:
##   i          row index into wiki.list.
##   wiki.list  data.table with (at least) the columns filename, wiki and
##              wiki.type.
##   path       directory prefix pasted in front of the file name.
##
## Returns the data.table from load.wikiq.file() with the columns wiki.name,
## wiki.type and time.first.edit (earliest date.time per editor.id and wiki)
## added.
load.wikiq.files <- function(i, wiki.list,
                             path = "wikiq_wikia_2010_all_nopersistence/") {
    wiki.filename <- wiki.list[i, filename]
    wiki <- wiki.list[i, wiki]
    this.wiki.type <- wiki.list[i, wiki.type]
    print(wiki)

    ## pass the type of this wiki explicitly so the date cutoff does not
    ## depend on a global wiki.list
    d <- load.wikiq.file(paste0(path, wiki.filename),
                         wiki.type = this.wiki.type)

    ## add the wiki name and type to the dt
    d$wiki.name <- rep(wiki, nrow(d))
    d$wiki.type <- rep(this.wiki.type, nrow(d))
    d[, time.first.edit := min(date.time), by = .(editor.id, wiki.name)]
    d
}

## Store a value in the global list `r` and persist that list to
## "remember.RDS" in the working directory.
##
## Arguments:
##   v       the value to store.
##   k       the name to store it under; defaults to the deparsed expression
##           that was passed as `v`.
##   silent  when FALSE (the default) the stored value is printed.
##
## If no object called `r` exists yet, it is read from "remember.RDS" when
## that file exists and otherwise initialised to an empty list.
## Returns the stored value invisibly.
remember <- function(v, k, silent = FALSE) {
    rfilename <- "remember.RDS"

    if (!exists("r")) {
        if (file.exists(rfilename)) {
            r <<- readRDS(rfilename)
        } else {
            r <<- list()
        }
    }

    if (missing(k)) {
        k <- deparse(substitute(v))
    }

    ## save to the global r variable/list
    r[[k]] <<- v

    if (!silent) {
        print(r[[k]])
        flush.console()
    }

    ## write first so that the invisible value is what the function returns
    saveRDS(r, rfilename)
    invisible(r[[k]])
}

## make sure that appendix, nosave and plot.distributions are always defined
if (!exists("appendix")) {
    appendix <- FALSE
}

if (!exists("nosave")) {
    nosave <- FALSE
}

if (!exists("plot.distributions")) {
    plot.distributions <- FALSE
}

basedir <- "."
## Build the global wiki.list (unless one is already defined), record the
## wikis that are used in the file "wikis.used" and set the number of cores
## used by the parallel package.
setwd(basedir)

include.wikipedia <- FALSE

if (!exists("wiki.list")) {
    subdir <- "userroles_data/"

    ## Only read the lists of unusable wikis when missing.wikis has not been
    ## defined yet.  The previous guard tested for an R object literally
    ## named "userroles_data/missing.wikis", which can never exist.
    if (!exists("missing.wikis")) {
        deleted.wikis <- fread(paste0(subdir, "allusers_deleted_merge.txt"),
                               header = FALSE, col.names = c("wiki"))
        deleted.wikis <- unique(deleted.wikis$wiki)

        notauthorized.wikis <- fread(
            paste0(subdir, "allusers_notauthorized_merge.txt"),
            header = FALSE, col.names = c("wiki"))
        notauthorized.wikis <- unique(notauthorized.wikis$wiki)

        missing.wikis <- c(deleted.wikis, notauthorized.wikis)
        remember(deleted.wikis)
        remember(notauthorized.wikis)
    }

    wiki.list <- fread("selected.wikis.csv")
    wiki.list <- wiki.list[!(wiki %in% missing.wikis)]
    wiki.list[wiki.type == "wikia", filename := paste0(wiki, ".tsv")]

    if (include.wikipedia) {
        ## the language code is the subdomain of the wikipedia url
        matchidx <- wiki.list[wiki.type == "wikipedia",
                              regexec("https://(.*)\\.wikipedia.org", url)]
        ## vapply keeps the result a character vector even when there are
        ## no wikipedia rows
        lang <- vapply(
            regmatches(wiki.list[wiki.type == "wikipedia", url], matchidx),
            function(l) l[2],
            character(1))
        lang <- gsub("-", "_", lang)
        wiki.list[wiki.type == "wikipedia", lang := lang]
        wiki.list[wiki.type == "wikipedia",
                  filename := paste0(lang, "_wikipedia.tsv")]
    } else {
        wiki.list <- wiki.list[wiki.type != "wikipedia"]
    }

    # wiki.list[,lang := NULL]
    rm(missing.wikis)
}

## write the list of wikis only once; an existing file is left untouched
if (!file.exists("wikis.used")) {
    write(wiki.list$wiki, "wikis.used")
}

options(mc.cores = 16)