# Community Data Science Collective R Utilities # # Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw # mako@atdot.cc, aaronshaw@northwestern.edu # privileges of interest: # a shared variable that gets used everywhere generate.admin.addrm <- function (logevents, current.admins) { # convert types of a few variables logevents$ancient <- logevents$ancient == "true" logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp) logevents$rights.new[is.na(logevents$rights.new)] <- "" logevents$rights.old[is.na(logevents$rights.old)] <- "" # TODO do wikia wikis have these =? # in WP, all of these are negated by one day logevents <- logevents[!(logevents$ancient & logevents$comment == "="),] ########################################## ### Parsing logevents file ######################################### # separate out moderns & ancients and the necessary columns ancients <- logevents[logevents$ancient,c("title","comment","timestamp")] moderns <- logevents[!logevents$ancient, c("title","rights.new","rights.old","timestamp")] # function that looks at rights.old, rights.new and returns a value of # privilege, add/remove, and timestamp for each user parse.moderns <- function (i, d) { user <- sub('^User:', "", d[i,"title"]) change.time <- d[i,"timestamp"] rights.new <- d[i,"rights.new"] rights.old <- d[i,"rights.old"] # create a vector of new and old rights: destring <- function (x) { strsplit(as.character(x), ", ")[[1]] } # create a list of privileges that are mentioned privileges <- unique(c(destring(rights.new), destring(rights.old))) # create T/F vectors incidating which privileges were added/removed added <- privileges[privileges %in% destring(rights.new) & !(privileges %in% destring(rights.old))] removed <- privileges[!(privileges %in% destring(rights.new)) & privileges %in% destring(rights.old)] # assemble the data frame of: role,action,user,timestamp data.frame(user=rep(user, length(c(added,removed))), role=c(added, removed), action=c(rep("added",length(added)), rep("removed", length(removed))), timestamp=rep(change.time, length(c(added,removed))), era=rep("modern", length(c(added,removed))), stringsAsFactors=FALSE) } # if there are log events, and there are non-ancients (not all are ancients), we parse them if (dim(logevents)[1] & !all(logevents$ancient)) { moderns.parsed <- do.call("rbind", lapply(1:dim(moderns)[1], parse.moderns, moderns)) } else { moderns.parsed = NULL } # another function to handle processing the ancients: parse.ancient <- function (i, d) { user <- sub('^.*?:', '', d[i,"title"]) comment <- d[i, "comment"] change.time <- d[i, "timestamp"] added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', ')) # clean any leadin, trailing whitespace added <- gsub("^\\s+|\\s+$", "", added) data.frame(user=user, role=added, action="added", timestamp=change.time, era="ancient", stringsAsFactors=FALSE) } # if there are any ancients, we parse them if (any(logevents$ancient)) { ancients.parsed <- do.call("rbind", lapply(1:dim(ancients)[1], parse.ancient, ancients)) } else { ancients.parsed = NULL } combined <- rbind(moderns.parsed, ancients.parsed) ########################################## ### Parsing current.admins file ######################################### # turn each of the columns after the first two into logical # function to process pre.ancients parse.current.admins <- function (i, d) { user <- d[i, "username"] roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]]) o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE) colnames(o) <- c("user", "role") return(o) } ## handle the case where there are no admins. This can happen on Wikipedia if(dim(current.admins)[1] != 0){ current.admins.parsed <- do.call("rbind", lapply(1:dim(current.admins)[1], parse.current.admins, current.admins)) } else{ current.admins.parsed <- NULL } # select pre-ancients as people who have a given right *today* but # were never seen as having it added is.pre.ancients <- function (i, d, combined) { user <- d[i, "user"] role <- d[i, "role"] # look to see if we've see any events with this user and role added: # if we see none, this is pre-ancient !any(combined$user == user & combined$role == role & combined$action == "added") } if(!is.null(current.admins.parsed)){ # create the list of pre-ancients (people role combinations we have # not seen in the logevents data pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1], is.pre.ancients, current.admins.parsed, combined),] } else{ pre.ancients <- NULL } # make a list of people who have been removed combined.removed <- combined[combined$action == "removed",] if (!is.null(combined.removed)) { if (dim(combined.removed)[1] > 0) { combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1], function (i,d) { user <- d[i,"user"] role <- d[i,"role"] timestamp <- d[i,"timestamp"] # was the person added before they were removed? OR in the pre-ancients any(combined$user == user & combined$role == role & combined$action == "added" & combined$timestamp <= timestamp) | (user %in% pre.ancients$user) }, combined.removed),c("user", "role")] } } pre.ancients <- rbind(pre.ancients, combined.removed) # give them the earliest ancient timestamp minus 1 day # and then add the pre.ancients to the if(!is.null(pre.ancients)){ pre.ancients$action <- "added" pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440 pre.ancients$era <- "pre.ancient" combined <- rbind(combined, pre.ancients) } # remove redunandt actions combined <- combined[!duplicated(combined),] return(combined) }