# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

# loads simple utility functions for use in the subsequent files

# vector of column names; store this for re-use across various scripts
wikiq.header <- c("title", "articleid", "revid", "timestamp", "anon",
                  "editor", "editor_id", "minor", "text_size",
                  "text_entropy", "text_md5", "reversion",
                  "additions_size", "deletions_size", "edits",
                  "articles", "users")

# helper function to load the TSV files our perl scripts are generating
#
# filename: passed straight to read.delim() as the file to read
#
# Returns a data.frame read with a header row, quoting disabled, empty
# fields treated as NA and strings converted to factors.
load.extracted.df <- function (filename) {
    read.delim(filename, header=TRUE, quote="", na.strings="",
               stringsAsFactors=TRUE)
}

# helper function to grab the classes of all columns of a dataframe
# keep this because it's being used but this can just be lapply(d, class)
#
# d: a data.frame
#
# Returns the result of sapply() over the column names: a named character
# vector when every column has a single class, otherwise a named list.
get.col.classes <- function (d) {
    sapply(colnames(d), function (col) { class(d[,col]) })
}

# convert mediawiki timestamps into POSIXct
#
# ts.string: character vector of timestamps such as "2010-01-02T03:04:05Z"
#
# Every "T" is replaced by a space and every "Z" is removed before the
# string is parsed as "%Y-%m-%d %H:%M:%S" in UTC.
timestamp.to.POSIXct <- function (ts.string) {
    ts.string <- gsub("T", " ", ts.string)
    ts.string <- gsub("Z", "", ts.string)
    return(as.POSIXct(ts.string, format="%Y-%m-%d %H:%M:%S", tz="UTC"))
}

# read a wikiq TSV stream into a data.frame
#
# con:            file name or connection handed to read.delim()
# header:         passed to read.delim(); TRUE if the first line holds the
#                 column names
# detect.reverts: if TRUE, sort by title and timestamp and add the columns
#                 "reverted" and "reverted.by" (see below); this also drops
#                 every row for which "reverted" cannot be determined (NA),
#                 which always includes the last row
#
# Returns a data.frame with "timestamp" as POSIXct (UTC), "reversion" as a
# logical, and an added logical column "ipaddress" that is TRUE where the
# editor field was empty (those editor fields are filled from editor.id).
read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
    d <- read.delim(con, stringsAsFactors=FALSE, header=header,
                    encoding="UTF-8", quote="")

    # rename date.time to timestamp and replace the first _ in each
    # column name with a .
    colnames(d)[colnames(d) == "date.time"] <- "timestamp"
    colnames(d) <- sub("_", ".", colnames(d))

    # NOTE(review): the pattern below is kept byte-for-byte from the
    # original; it contains the raw byte \xc8 and looks mis-encoded, so it
    # presumably never matches real timestamps -- confirm what it was meant
    # to strip. useBytes=TRUE makes the match bytewise so that the non-UTF-8
    # pattern cannot be rejected as an invalid string in a UTF-8 locale.
    d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
                                  d$timestamp, useBytes=TRUE),
                              tz="UTC")

    # convert reversion to a logical
    d$reversion <- !is.na(d$reversion)

    if (detect.reverts) {
        # shift a column up/down by one row, padding with NA; written
        # without 2:n / 1:(n-1) so that frames with 0 or 1 rows work too
        shift.up <- function (x, n) { c(x[-1], NA)[seq_len(n)] }
        shift.down <- function (x, n) {
            c(NA, x[-length(x)])[seq_len(n)]
        }

        # reorder so we can now find the order and timestamp
        d <- d[order(d$title, d$timestamp),]
        n <- nrow(d)

        # generate a list of reverted editors and a list of previous and
        # next md5
        # NOTE(review): the shifts run over the whole sorted frame, so the
        # neighbouring rows may belong to a different title -- confirm
        # that this is intended
        d$reverted <- shift.up(d$reversion, n)
        d$md5.next <- shift.up(d$text.md5, n)
        d$md5.prev <- shift.down(d$text.md5, n)

        # an edit counts as reverted when the next edit is a reversion
        # and restores the text that preceded this edit
        d$reverted <- d$reverted & (d$md5.next == d$md5.prev)

        # drop the extra columns and the rows with unknown (NA) reverted
        # status, which includes the last edit
        d <- d[!is.na(d$reverted),]
        d <- d[,!colnames(d) %in% c("md5.next", "md5.prev")]

        # create a reverted by variable by shifting up the editors and
        # then NAing nonreverts
        d$reverted.by <- shift.up(d$editor, nrow(d))
        d$reverted.by[!d$reverted] <- NA
    }

    # set ip address to the username and create a new variable
    d$ipaddress <- d$editor == ""
    d$editor[d$editor == ""] <- d$editor.id[d$editor == ""]

    return(d)
}

# TODO refactor this so that we clean the data BEFORE we read it into R
# ATM, this is set to only work on 14 item issues
# see the vereins wiki for "Philcomputing" and 29 lines that seem to
# have a newline in the editor name
#
# read a bzip2-compressed wikiq file through "bzcat | awk", keeping only
# the lines that have exactly 14 tab-separated fields
#
# filename:       path of the .bz2 file; "~" is expanded and the path is
#                 shell-quoted, so spaces and shell metacharacters are safe
# header:         passed on to read.wikiq()
# detect.reverts: passed on to read.wikiq()
#
# Returns the data.frame produced by read.wikiq().
read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE) {
    cmd <- paste("bzcat", shQuote(path.expand(filename)),
                 "|awk -F'\t' '{if (NF == 14) print;}'")

    # the pipe is handed over unopened: read.delim() opens it and closes
    # (and thereby destroys) it when it is done, so no cleanup is needed
    read.wikiq(pipe(cmd), header=header, detect.reverts=detect.reverts)
}