RCommunityData/R/wikiq.R

   1 # Community Data Science Collective R Utilities
   2 #
   3 # Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
   4 # mako@atdot.cc, aaronshaw@northwestern.edu
   5
   6 # loads simple utility functions for use in the subsequent files
   7
   8 # store this for re-use across various scripts
   9 wikiq.header <- c("title", "articleid", "revid", "timestamp", "anon",
  10                   "editor", "editor_id", "minor", "text_size",
  11                   "text_entropy", "text_md5", "reversion",
  12                   "additions_size", "deletions_size", "edits",
  13                   "articles", "users")
  14
  15 # helper function to load the TSV files our perl scripts are generating
  16 load.extracted.df <- function (filename) {
  17   read.delim(filename, header=T, quote="", na.strings="", stringsAsFactors=TRUE)
  18 }
  19
  20 # helper function to grab the classes of all columns of a dataframe
  21 # keep this because it's being used but this can just be lapply(d, class)
  22 get.col.classes <- function (d) {
  23   sapply(colnames(d), function (col) { class(d[,col]) })
  24 }
  25
  26 # convert mediawiki timestamps into POSIXct
  27 timestamp.to.POSIXct <- function (ts.string)  {
  28   ts.string <- gsub("T", " ", ts.string)
  29   ts.string <- gsub("Z", "", ts.string)
  30   return(as.POSIXct(ts.string, format="%Y-%m-%d %H:%M:%S", tz="UTC"))
  31 }
  32
  33
  34 read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
  35   d <- read.delim(con, stringsAsFactors=FALSE, header=header,
  36                   encoding="UTF-8", quote="")
  37
  38   # rename date.time to timestamp and remove _
  39   colnames(d)[colnames(d) == "date.time"] <- "timestamp"
  40   colnames(d) <- sub("_", ".", colnames(d))
  41
  42   d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
  43                                 d$timestamp), tz="UTC")
  44
  45   # convert reversion to a logical
  46   d$reversion <- !is.na(d$reversion)
  47
  48   if (detect.reverts) {
  49       # reorder so we cannow find the order and timestamp
  50       d <- d[order(d$title, d$timestamp),]
  51
  52       # generate a list of reverted editors and a list of previous and next md5
  53       d$reverted <- c(d$reversion[2:length(d$reversion)],NA)
  54       d$md5.next <- c(d$text.md5[2:length(d$reversion)],NA)
  55       d$md5.prev <- c(NA,d$text.md5[1:(length(d$reversion)-1)])
  56       d$reverted <- d$reverted & (d$md5.next == d$md5.prev)
  57
  58       # drop the extra columns and the last edit
  59       d <- d[!is.na(d$reverted),]
  60       d <- d[,!colnames(d) %in% c("md5.next", "md5.prev")]
  61
  62       # create a reverted by variable by shifting up the editors and
  63       # then NAing nonreverts
  64       d$reverted.by <- c(d$editor[2:length(d$reversion)], NA)
  65       d$reverted.by[!d$reverted] <- NA
  66   }
  67   # set ip address to the username and create a new variable
  68   d$ipaddress <- d$editor == ""
  69   d$editor[d$editor == ""] <- d$editor.id[d$editor == ""]
  70
  71   # delete the connection
  72   return(d)
  73 }
  74
  75 # TODO refactor this so that we clean the data BEFORE we read it into R
  76 # ATM, this is set to only work on 14 item issues
  77
  78 # see the vereins wiki for "Philcomputing" and 29 lines that seem to
  79 # have a newline in the editor name
  80 read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE) {
  81   con <- pipe(paste("bzcat", filename, "|awk -F'\t' '{if (NF == 14) print;}'"))
  82   d <- read.wikiq(con, header=header, detect.reverts=detect.reverts)
  83   rm(con)
  84   return(d)
  85 }
  86