# Build a wide eid-by-term table for the ngrams that occur across many
# subject areas and save it to processed_data/top.ngram.matrix.RData.
#
# Inputs (paths are relative to the working directory):
#   processed_data/ngram_table.csv - ngram table (top 100,000 ngrams?).
#     Note that the file is not pushed to the repository, but is available
#     on hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv
#   processed_data/abstracts.tsv   - read for its "eid" and
#     "first_ASJC_subject_area" columns.
#
# Output:
#   processed_data/top.ngram.matrix.RData, holding `top.ngram.matrix`
#   (one row per eid, one column per retained term).

library(data.table)
# NOTE(review): reshape2 is attached after data.table, so the bare dcast()
# call further down resolves to reshape2's dcast, not data.table's. That is
# probably why the reshape is so slow; switching to data.table::dcast would
# likely be much faster but would change the class of the saved object, so
# confirm with downstream users before changing it.
library(reshape2)

# Terms must occur in MORE than this many distinct subject areas to be kept
# (the comparison below is strict).
subject.area.cutoff <- 30

# Import ngram data ----

# Drop the third column and call the first one "eid" so that it lines up with
# the merge key in the abstracts table.
ngrams <- read.delim("processed_data/ngram_table.csv",
                     sep = ",",
                     header = TRUE,
                     stringsAsFactors = FALSE)[, -3]
names(ngrams)[1] <- "eid"

subjects <- read.delim("processed_data/abstracts.tsv",
                       header = TRUE,
                       stringsAsFactors = FALSE,
                       sep = "\t")[, c("eid", "first_ASJC_subject_area")]
names(subjects)[2] <- "subject"

# Left join: every ngram row is kept, and rows whose eid has no match in
# `subjects` get subject = NA. Takes a couple of minutes.
ngrams <- merge(ngrams, subjects, by = "eid", all.x = TRUE)

# Only use ngrams that occur across many subject areas ----

# Number of distinct subject areas per term. NA subjects (introduced by the
# left join above) are not a subject area, so they are excluded from the
# count instead of being tallied as one more distinct value.
subject.by.ngram <- tapply(
  ngrams$subject,
  ngrams$term,
  function(x) length(unique(x[!is.na(x)]))
)

# summary(subject.by.ngram)
#
# library(txtplot)
# txtdensity(log(subject.by.ngram))

# Note:
# The median number of subject areas per term is five. We cut it off at
# terms that occur across more than `subject.area.cutoff` subject areas.
widespread.terms <- names(subject.by.ngram[subject.by.ngram > subject.area.cutoff])
top.ngrams <- ngrams[ngrams$term %in% widespread.terms, c("eid", "term")]

# Free the large intermediates before the memory-hungry reshape.
rm(ngrams, subject.by.ngram, subjects)

# Convert to a wide format table, one column per term ----

top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)

# Constant marker column; the reshape counts how many of these fall into each
# (eid, term) cell, so a cell is 0 when the term is absent for that eid.
top.ngrams[, vv := TRUE]

# Took more than 20 minutes on hyak.
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length, value.var = "vv")

rm(top.ngrams)

save(top.ngram.matrix, file = "processed_data/top.ngram.matrix.RData")
# load("processed_data/top.ngram.matrix.RData")