source("code/prediction/utils.R")

# Container for values that are reused in the paper. It is filled in step by
# step below and written to disk at the end of the script.
pred.descrip <- NULL

# Abstracts ----
abstracts <- read.delim("processed_data/abstracts.tsv", header = TRUE,
                        stringsAsFactors = FALSE, sep = "\t")

# The abstract text column is not used in this script.
abstracts <- subset(abstracts, select = -abstract)

# Keep only rows whose aggregation type is known and is not "Trade Journal".
abstracts <- abstracts[!is.na(abstracts$aggregation_type) &
                         abstracts$aggregation_type != "Trade Journal", ]

names(abstracts)[names(abstracts) == "num_citations"] <- "works_cited"

# A missing works-cited count is treated as zero.
abstracts$works_cited[is.na(abstracts$works_cited)] <- 0

# Affiliations ----
affiliations <- read.delim("processed_data/paper_aff_table.tsv", header = TRUE,
                           stringsAsFactors = FALSE, sep = "\t")

# Eliminate missing values. The explicit is.na() test on `organization`
# matters: `NA != ""` is NA, and an NA in a logical row index yields an all-NA
# row instead of dropping the row.
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
                               !is.na(affiliations$organization) &
                               affiliations$organization != "", ]

# Return the most frequent organization name recorded for one affiliation id.
#
# Args:
#   aff.id: a single affiliation id.
#   aff.df: data frame with `affiliation_id` and `organization` columns;
#           defaults to the global `affiliations` table.
#
# Returns:
#   A length-one character vector. When several names are equally frequent,
#   the one that sorts last is returned (table() orders names, the sort is
#   stable, and tail() takes the final entry).
remap.affiliations <- function(aff.id, aff.df = affiliations) {
  orgs <- aff.df$organization[which(aff.df$affiliation_id == aff.id)]
  names(tail(sort(table(orgs)), 1))
}

# Compute the modal name once per distinct id and map it back onto the rows,
# rather than recomputing it for every row.
aff.ids <- unique(affiliations$affiliation_id)
modal.orgs <- vapply(aff.ids, remap.affiliations, character(1),
                     aff.df = affiliations, USE.NAMES = FALSE)
affiliations$organization <-
  modal.orgs[match(affiliations$affiliation_id, aff.ids)]

affiliations <- subset(affiliations, select = c(paper_eid, organization))
names(affiliations) <- c("eid", "affiliation")

# Need to remove repeat affiliations: keep the first row seen for each eid.
affiliations <- affiliations[!duplicated(affiliations$eid), ]

######################################
# Covariates ----
d <- abstracts[, c("eid", "language", "modal_country", "source_title",
                   "works_cited")]

# Dichotomous dependent variable.
d$cited <- abstracts$cited_by_count > 0

# Store these here for use in the paper before we run any restrictions.
pred.descrip$cited <- d$cited
pred.descrip$cites <- abstracts$cited_by_count

# We want these to be categorical variables.
d$modal_country <- factor(d$modal_country)
d$language <- factor(d$language)
d$subject <- factor(abstracts$first_ASJC_subject_area)
d$source_title <- factor(d$source_title)
d$month <- factor(strftime(abstracts$date, format = "%m"))

# Except for publication year - keep that continuous.
d$year <- as.numeric(strftime(abstracts$date, format = "%Y"))

# Bring in org affiliations. merge() keeps only eids present in both tables,
# so this drops papers without org info.
d <- merge(d, affiliations, by = "eid")
d$affiliation <- factor(d$affiliation)

##### Restrictions:
### Do this explicitly, one variable at a time, so that changes are easy.
# NOTE(review): restrict() comes from the sourced utils file; its exact
# behavior and the meaning of the third argument are not visible here.
d <- restrict(d, d$affiliation, 1)
d <- restrict(d, d$subject, 1)
d <- restrict(d, d$source_title, 1)
d <- restrict(d, d$language, 1)
d <- restrict(d, d$modal_country, 1)

# Noted in the original but not built in this script:
# n.authors
# per author prior citations

pred.descrip$covars <- d

save(pred.descrip, file = "paper/data/prediction_descriptives.RData")

rm(d, abstracts, affiliations)