1 # Community Data Science Collective R Utilities
3 # Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
4 # mako@atdot.cc, aaronshaw@northwestern.edu
6 # privileges of interest:
7 # a shared variable that gets used everywhere
8 generate.admin.addrm <- function (logevents, current.admins) {
10 # convert types of a few variables
11 logevents$ancient <- logevents$ancient == "true"
12 logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
13 logevents$rights.new[is.na(logevents$rights.new)] <- ""
14 logevents$rights.old[is.na(logevents$rights.old)] <- ""
16 # TODO: do Wikia wikis have these "=" comments?
17 # in WP, all of these are negated by one day
18 logevents <- logevents[!(logevents$ancient & logevents$comment == "="),]
20 ##########################################
21 ### Parsing logevents file
22 #########################################
24 # separate out moderns & ancients and the necessary columns
25 ancients <- logevents[logevents$ancient,c("title","comment","timestamp")]
26 moderns <- logevents[!logevents$ancient,
27 c("title","rights.new","rights.old","timestamp")]
29 # function that looks at rights.old, rights.new and returns a value of
30 # privilege, add/remove, and timestamp for each user
31 parse.moderns <- function (i, d) {
32 user <- sub('^User:', "", d[i,"title"])
33 change.time <- d[i,"timestamp"]
34 rights.new <- d[i,"rights.new"]
35 rights.old <- d[i,"rights.old"]
37 # create a vector of new and old rights:
38 destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }
40 # create a list of privileges that are mentioned
41 privileges <- unique(c(destring(rights.new),
42 destring(rights.old)))
44 # create T/F vectors indicating which privileges were added/removed
45 added <- privileges[privileges %in% destring(rights.new) &
46 !(privileges %in% destring(rights.old))]
47 removed <- privileges[!(privileges %in% destring(rights.new)) &
48 privileges %in% destring(rights.old)]
50 # assemble the data frame of: role,action,user,timestamp
51 data.frame(user=rep(user, length(c(added,removed))),
52 role=c(added, removed),
53 action=c(rep("added",length(added)),
54 rep("removed", length(removed))),
55 timestamp=rep(change.time, length(c(added,removed))),
56 era=rep("modern", length(c(added,removed))),
57 stringsAsFactors=FALSE)
60 # if there are log events, and there are non-ancients (not all are ancients), we parse them
61 if (dim(logevents)[1] & !all(logevents$ancient)) {
62 moderns.parsed <- do.call("rbind",
63 lapply(1:dim(moderns)[1], parse.moderns, moderns))
68 # another function to handle processing the ancients:
69 parse.ancient <- function (i, d) {
70 user <- sub('^.*?:', '', d[i,"title"])
71 comment <- d[i, "comment"]
72 change.time <- d[i, "timestamp"]
74 added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', '))
76 # clean any leading or trailing whitespace
77 added <- gsub("^\\s+|\\s+$", "", added)
82 timestamp=change.time,
84 stringsAsFactors=FALSE)
87 # if there are any ancients, we parse them
88 if (any(logevents$ancient)) {
89 ancients.parsed <- do.call("rbind",
90 lapply(1:dim(ancients)[1], parse.ancient, ancients))
92 ancients.parsed = NULL
95 combined <- rbind(moderns.parsed, ancients.parsed)
97 ##########################################
98 ### Parsing current.admins file
99 #########################################
100 # turn each of the columns after the first two into logical
102 # function to process pre.ancients
103 parse.current.admins <- function (i, d) {
104 user <- d[i, "username"]
105 roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]])
107 o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE)
108 colnames(o) <- c("user", "role")
112 ## handle the case where there are no admins. This can happen on Wikipedia
113 if(dim(current.admins)[1] != 0){
114 current.admins.parsed <- do.call("rbind",
115 lapply(1:dim(current.admins)[1],
116 parse.current.admins, current.admins))
119 current.admins.parsed <- NULL
122 # select pre-ancients as people who have a given right *today* but
123 # were never seen as having it added
124 is.pre.ancients <- function (i, d, combined) {
128 # look to see if we've seen any events with this user and role added:
129 # if we see none, this is pre-ancient
130 !any(combined$user == user &
131 combined$role == role &
132 combined$action == "added")
136 if(!is.null(current.admins.parsed)){
137 # create the list of pre-ancients (user/role combinations we have
138 # not seen added in the logevents data)
139 pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1],
141 current.admins.parsed,
148 # make a list of people who have been removed
149 combined.removed <- combined[combined$action == "removed",]
150 if (!is.null(combined.removed)) {
151 if (dim(combined.removed)[1] > 0) {
152 combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1],
156 timestamp <- d[i,"timestamp"]
158 # was the person added before they were removed? OR in the pre-ancients
159 any(combined$user == user &
160 combined$role == role &
161 combined$action == "added" &
162 combined$timestamp <= timestamp) | (user %in% pre.ancients$user)
163 }, combined.removed),c("user", "role")]
168 pre.ancients <- rbind(pre.ancients, combined.removed)
170 # give them a fixed placeholder timestamp (2000-01-01; previously the
171 # earliest timestamp minus 1 day) and then add the pre.ancients to the combined data
172 if(!is.null(pre.ancients)){
173 pre.ancients$action <- "added"
174 pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440
175 pre.ancients$era <- "pre.ancient"
177 combined <- rbind(combined, pre.ancients)
180 # remove redundant actions
181 combined <- combined[!duplicated(combined),]