]> code.communitydata.science - social-media-chapter.git/blob - code/prediction/02-build_textual_features.R
ignore rendered HTML
[social-media-chapter.git] / code / prediction / 02-build_textual_features.R
1 library(data.table)
2
3
# Load the ngram table.
# The input file is not pushed to the repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv

# Top 100,000 ngrams (?)
ngrams <- read.delim("processed_data/ngram_table.csv", header=TRUE,
                     sep=",", stringsAsFactors=FALSE)

# the third column is not used below; the first column is the document id
ngrams <- ngrams[, -3]
names(ngrams)[1] <- "eid"
12
# Read the abstracts table and keep only the document id and its first
# ASJC subject area, renamed to "subject".
subjects <- read.delim("processed_data/abstracts.tsv", sep="\t",
                       header=TRUE, stringsAsFactors=FALSE)
subjects <- subjects[, c("eid", "first_ASJC_subject_area")]
names(subjects)[2] <- "subject"

# Attach the subject to every ngram row. This is a left join: ngram rows
# whose eid has no match in subjects are kept, with subject set to NA.
# takes a couple of minutes:
ngrams <- merge(x=ngrams, y=subjects, by="eid", all.x=TRUE)
20
# only use ngrams that occur across all (many?) subject areas
#
# For every term, count the number of distinct subject areas it occurs in.
# The result is indexed by term.
#
# ngrams was left-merged with the subject table (all.x=TRUE), so rows whose
# eid has no subject carry NA. NA is dropped before counting so that
# "subject unknown" is not counted as a subject area of its own.
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
    length(unique(x[!is.na(x)])))
24
25 # summary(subject.by.ngram)
26 #
27 # library(txtplot)
28 # txtdensity(log(subject.by.ngram))
29
# Note:
# The median number of subject areas per term is five. Only terms that
# occur across more than 30 subject areas are kept.

# names of the terms that pass the threshold
widespread.terms <- names(subject.by.ngram[subject.by.ngram > 30])

# keep the (eid, term) pairs for those terms only
keep.row <- ngrams$term %in% widespread.terms
top.ngrams <- ngrams[keep.row, c("eid", "term")]

# free the large intermediates (and the two helpers created above)
rm(ngrams, subject.by.ngram, subjects, widespread.terms, keep.row)
39
# Convert to a wide format table with one row per eid and one column per
# term. Each cell holds the number of (eid, term) rows in top.ngrams, so
# the columns are dichotomous (0/1) only if each pair appears at most once
# in the input -- TODO confirm against ngram_table.csv.
library(reshape2)
library(data.table)

top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)

# constant column for dcast to aggregate over
top.ngrams[, vv := TRUE]

# data.table is already attached at the top of this script, so the
# library(data.table) call above does not move it ahead of reshape2 on the
# search path, and a bare dcast() resolves to reshape2::dcast. Call the
# data.table implementation explicitly instead.
# (the original unqualified call took more than 20 minutes on hyak)
# NOTE(review): data.table orders rows/columns with its own C-locale sort,
# so the ordering may differ from the reshape2 result -- confirm that
# downstream code selects by name / eid rather than by position.
top.ngram.matrix <- data.table::dcast(top.ngrams, eid ~ term,
                                      fun.aggregate = length,
                                      value.var = "vv")

# reshape2::dcast returned a plain data.frame; keep that class for whatever
# loads the saved file. setDF() converts in place, without copying.
setDF(top.ngram.matrix)

rm(top.ngrams)

save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
56 #load("processed_data/top.ngram.matrix.RData")

Community Data Science Collective || Want to submit a patch?