# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

## functions to deal with namespace information
#####################################################################

# Read the table of Wikia namespaces from a headerless, tab-separated file.
#
# Arguments:
#   path: location of the TSV file; defaults to the path this function
#         has always read from.
#
# Returns a data.frame with the columns "wiki", "ns.num" and "ns.string".
# Strings are read as factors and ns.num is converted to a factor as well.
load.wikia.namespaces <- function (path="~/data/wikia_namespaces.tsv") {
    # load namespace data
    wikia.namespaces <- read.delim(path,
                                   stringsAsFactors=TRUE, header=FALSE)

    colnames(wikia.namespaces) <- c("wiki", "ns.num", "ns.string")
    wikia.namespaces$ns.num <- as.factor(wikia.namespaces$ns.num)
    return(wikia.namespaces)
}

# enwiki - move to barnstars directory
# TODO: TEST
#
# Build the hard-coded namespace table listed below, labelled as enwiki.
#
# Returns a numeric vector of namespace numbers whose names are the
# namespace strings (the entry for namespace 0 has the empty name "").
load.enwiki.namespaces <- function(){
    enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                       12, 13, 14, 15, 100, 101, 108, 109)

    names(enwiki.ns.num) <- c(
        "Media", "Special", "", "Talk", "User", "User talk",
        "Wikipedia", "Wikipedia talk", "File", "File talk",
        "MediaWiki", "MediaWiki talk", "Template", "Template talk",
        "Help", "Help talk", "Category", "Category talk",
        "Portal", "Portal talk", "Book", "Book talk")

    # return the named vector itself: the value of a `names<-` assignment
    # is only the vector of names, not the object that was named
    enwiki.ns.num
}

# function to take a list of article titles and a wiki name and return
# a list of numbered namespaces
#
# Arguments:
#   page.titles: vector of page titles; underscores are treated as spaces.
#   wiki: name of the wiki, compared against the "wiki" column of the
#         namespace table.
#
# Uses an object called wikia.namespaces if one is visible from here and
# otherwise loads it with load.wikia.namespaces().
#
# Returns a factor with one element per title holding the namespace number
# as a string. Titles that do not start with "<namespace>:" for any known
# non-zero namespace are assigned "0".
titles.to.ns.num <- function (page.titles, wiki) {
    # load wikia namespace data from disk if it does not exist
    if (!exists("wikia.namespaces")) {
        wikia.namespaces <- load.wikia.namespaces()
    }

    # page.titles <- d$title # DEBUG
    ns.df <- wikia.namespaces[wikia.namespaces$wiki == wiki,
                              c("ns.num", "ns.string")]

    namespaces <- as.character(ns.df$ns.num)
    ns.strings <- as.character(ns.df$ns.string)

    # drop the zero, we'll deal with it later; also drop missing or
    # unnamed entries, which cannot be used as a title prefix
    usable <- !is.na(namespaces) & namespaces != "0" &
        !is.na(ns.strings) & ns.strings != ""
    namespaces <- namespaces[usable]
    ns.strings <- ns.strings[usable]

    # change underscores to spaces (necessary?)
    page.titles <- gsub('_', ' ', page.titles)
    page.ns <- rep("0", length(page.titles))

    for (i in seq_along(namespaces)) {
        # literal prefix comparison so that characters such as "." or "("
        # in a namespace name are not interpreted as a regular expression;
        # which() ignores the NA that a missing title produces
        prefix <- paste0(ns.strings[i], ":")
        page.ns[which(startsWith(page.titles, prefix))] <- namespaces[i]
    }

    # return the list of namespaces as a factor
    return(as.factor(page.ns))
}