# Note: this file is not pushed to the repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv

# Top 100,000 ngrams (?)
# Read the comma-separated ngram table, dropping the third column;
# the first column is the article id.
ngrams <- read.csv("processed_data/ngram_table.csv",
                   stringsAsFactors = FALSE)[, -3]
names(ngrams)[1] <- "eid"
# Read the abstracts metadata, keeping only the article id and the
# first ASJC subject area assigned to each abstract.
keep.cols <- c("eid", "first_ASJC_subject_area")
subjects <- read.delim("processed_data/abstracts.tsv",
                       sep = "\t", header = TRUE,
                       stringsAsFactors = FALSE)[, keep.cols]
names(subjects)[2] <- "subject"
# Attach each ngram occurrence's subject area via a left outer join on
# the article id (unmatched ngrams keep NA). Takes a couple of minutes:
ngrams <- merge(ngrams, subjects, all.x = TRUE, by = "eid")
# Only use ngrams that occur across all (many?) subject areas.
# For each term, count the number of distinct subject areas in which it
# appears. NOTE(review): the anonymous-function body was truncated in
# this copy of the file; length(unique(x)) is reconstructed from the
# surrounding comments ("number of subject areas per term") and the
# numeric cutoff applied below — confirm against the original source.
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
    length(unique(x)))

# summary(subject.by.ngram)

# txtdensity(log(subject.by.ngram))
# The median number of subject areas per term is five. Keep only the
# terms that appear in more than 30 distinct subject areas.
# NOTE(review): the original comment said "at least 30", but the code
# uses a strict > 30 cutoff; the code's behavior is preserved here.
common.terms <- names(subject.by.ngram)[subject.by.ngram > 30]
top.ngrams <- ngrams[ngrams$term %in% common.terms, c("eid", "term")]

# Free the large intermediate objects.
rm(ngrams, subject.by.ngram, subjects, common.terms)
# Convert to a wide-format matrix of dichotomous variables.
top.ngrams <- as.data.table(top.ngrams)
setkey(top.ngrams, eid)

# Indicator column; every (eid, term) row present in the data gets TRUE.
top.ngrams[, vv := TRUE]
# Cast to one row per article, one column per term; length() counts the
# occurrences of each term within each article. Took more than 20
# minutes on hyak.
# NOTE(review): the tail of this call was truncated in this copy of the
# file; value.var = "vv" is reconstructed from the indicator column
# created above — confirm against the original source.
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
                          value.var = "vv")
# Persist the wide matrix so later scripts can load() it rather than
# redo the slow dcast above.
save(top.ngram.matrix, file = "processed_data/top.ngram.matrix.RData")
# load("processed_data/top.ngram.matrix.RData")