]> code.communitydata.science - social-media-chapter.git/blob - code/prediction/01-build_control_variables.R
ignore rendered HTML
[social-media-chapter.git] / code / prediction / 01-build_control_variables.R
1 source("code/prediction/utils.R")
2
# Accumulates values that the paper reports; later steps add elements to it.
pred.descrip <- NULL

# Read the abstracts table (tab-separated, with a header row).
abstracts <- read.delim("processed_data/abstracts.tsv", header = TRUE,
                        stringsAsFactors = FALSE, sep = "\t")

# Drop the `abstract` column; nothing below reads it.
abstracts <- subset(abstracts, select = -abstract)

# Keep rows with a known aggregation type other than "Trade Journal".
# Rows whose type is NA come out FALSE here (FALSE & NA is FALSE), so they
# are dropped rather than turned into NA rows.
abstracts <- abstracts[!is.na(abstracts$aggregation_type) &
                       abstracts$aggregation_type != "Trade Journal", ]

# Rename num_citations to works_cited, then count missing values as zero.
names(abstracts) <- replace(names(abstracts),
                            names(abstracts) == "num_citations",
                            "works_cited")
abstracts$works_cited[is.na(abstracts$works_cited)] <- 0
16
# affiliations
# Read the paper-affiliation table (tab-separated, with a header row).
affiliations <- read.delim("processed_data/paper_aff_table.tsv",
                           header = TRUE, stringsAsFactors = FALSE,
                           sep = "\t")

# Eliminate missing values: keep only rows that have an affiliation id and
# a non-empty organization. The explicit is.na() test on organization is
# needed because `NA != ""` is NA, and an NA in a logical row index makes
# `[` return an all-NA row instead of dropping the row.
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
                             !is.na(affiliations$organization) &
                             affiliations$organization != "", ]
25
26
# Return the most frequent organization name recorded for one affiliation id.
#
# Args:
#   aff.id: a single affiliation id to look up.
#   aff.df: data frame with `affiliation_id` and `organization` columns.
#           Defaults to the global `affiliations` table.
#
# Returns:
#   The organization string that occurs most often among the rows of
#   `aff.df` whose `affiliation_id` equals `aff.id`. When several names
#   share the top count, the one that table() orders last is returned.
remap.affiliations <- function(aff.id,
                               aff.df = affiliations){
    # Look the id up in the table that was passed in, not in the global one.
    orgs <- aff.df$organization[aff.df$affiliation_id == aff.id]
    # sort() puts counts in ascending order, so the last entry is the mode.
    org.counts <- sort(table(orgs))
    names(tail(org.counts, 1))
}
33
# Replace each row's organization with the most frequent organization name
# for its affiliation id. remap.affiliations() scans the whole table on
# every call, so it is called once per distinct id and the result is spread
# back over the rows. local() keeps the helper variables out of the global
# environment.
affiliations$organization <- local({
    aff.ids <- unique(affiliations$affiliation_id)
    # vapply() stops with an error unless every id maps to exactly one string
    modal.orgs <- vapply(aff.ids, remap.affiliations, character(1))
    modal.orgs[match(affiliations$affiliation_id, aff.ids)]
})

# Keep only the paper id and the organization, under the names used below.
affiliations <- affiliations[, c("paper_eid", "organization")]
names(affiliations) <- c("eid", "affiliation")

# need to remove repeat affiliations: keep the first row for each paper
affiliations <- affiliations[!duplicated(affiliations$eid), ]
42
43
######################################
# Assemble the analysis data frame `d`, starting from the abstract-level
# columns that are carried over unchanged.
d <- abstracts[c("eid", "language", "modal_country",
                 "source_title", "works_cited")]

# Dichotomous dependent variable: TRUE when cited_by_count is above zero.
d$cited <- abstracts$cited_by_count > 0

# Record the outcome for use in the paper now, before any restrictions
# (or the merge below) remove rows.
pred.descrip$cited <- d$cited
pred.descrip$cites <- abstracts$cited_by_count

# These predictors are categorical, so convert them to factors.
d[c("modal_country", "language", "source_title")] <-
    lapply(d[c("modal_country", "language", "source_title")], factor)
d$subject <- factor(abstracts$first_ASJC_subject_area)
d$month <- factor(strftime(abstracts$date, format = "%m"))
# Publication year is the exception: keep it continuous.
d$year <- as.numeric(strftime(abstracts$date, format = "%Y"))

# Bring in organization affiliations. merge() keeps only the eids present
# in both tables, so papers without organization info are dropped here.
d <- merge(d, affiliations, by = "eid")

d$affiliation <- factor(d$affiliation)
71
##### Restrictions:

### One explicit call per variable, so that each is easy to change.
### NOTE(review): restrict() is not defined in this file (presumably it
### comes from code/prediction/utils.R); the meaning of the third
### argument is not visible here -- confirm there.
d <- restrict(d, d$affiliation, 1)
d <- restrict(d, d$subject, 1)
d <- restrict(d, d$source_title, 1)
d <- restrict(d, d$language, 1)
d <- restrict(d, d$modal_country, 1)

# n.authors
# per author prior citations

# Store the restricted covariates and write everything collected for the
# paper to disk.
pred.descrip$covars <- d
save(list = "pred.descrip",
     file = "paper/data/prediction_descriptives.RData")


# Remove the working tables from the workspace.
rm(list = c("d", "abstracts", "affiliations"))
89

Community Data Science Collective || Want to submit a patch?