# Note: this file is not pushed to the repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv

# Top 100,000 ngrams (?)
# Read the comma-separated ngram table, dropping the third column;
# the first column is the article id.
ngrams <- read.csv("processed_data/ngram_table.csv",
                   stringsAsFactors = FALSE)[, -3]
names(ngrams)[1] <- "eid"
# Read the abstracts metadata, keeping only the article id and the
# first ASJC subject area assigned to each abstract.
keep.cols <- c("eid", "first_ASJC_subject_area")
subjects <- read.delim("processed_data/abstracts.tsv",
                       sep = "\t", header = TRUE,
                       stringsAsFactors = FALSE)[, keep.cols]
names(subjects)[2] <- "subject"
# Attach each ngram occurrence's subject area via a left outer join on
# the article id (unmatched ngrams keep NA). Takes a couple of minutes:
ngrams <- merge(ngrams, subjects, all.x = TRUE, by = "eid")
# Only use ngrams that occur across all (many?) subject areas.
# For each term, count the number of distinct subject areas in which it
# appears. NOTE(review): the anonymous-function body was truncated in
# this copy of the file; length(unique(x)) is reconstructed from the
# surrounding comments ("number of subject areas per term") and the
# numeric cutoff applied below — confirm against the original source.
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
    length(unique(x)))

# summary(subject.by.ngram)

# txtdensity(log(subject.by.ngram))
# The median number of subject areas per term is five. Keep only the
# terms that appear in more than 30 distinct subject areas.
# NOTE(review): the original comment said "at least 30", but the code
# uses a strict > 30 cutoff; the code's behavior is preserved here.
common.terms <- names(subject.by.ngram)[subject.by.ngram > 30]
top.ngrams <- ngrams[ngrams$term %in% common.terms, c("eid", "term")]

# Free the large intermediate objects.
rm(ngrams, subject.by.ngram, subjects, common.terms)
# Convert to a wide-format matrix of dichotomous variables.
top.ngrams <- as.data.table(top.ngrams)
setkey(top.ngrams, eid)

# Indicator column; every (eid, term) row present in the data gets TRUE.
top.ngrams[, vv := TRUE]
# Cast to one row per article, one column per term; length() counts the
# occurrences of each term within each article. Took more than 20
# minutes on hyak.
# NOTE(review): the tail of this call was truncated in this copy of the
# file; value.var = "vv" is reconstructed from the indicator column
# created above — confirm against the original source.
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
                          value.var = "vv")
# Persist the wide matrix so later scripts can load() it rather than
# redo the slow dcast above.
save(top.ngram.matrix, file = "processed_data/top.ngram.matrix.RData")
# load("processed_data/top.ngram.matrix.RData")