library(data.table) library(Matrix) library(glmnet) library(xtable) library(methods) predict.list <- NULL if(!exists("top.ngram.matrix")){ load("processed_data/top.ngram.matrix.RData") } if(!exists("pred.descrip")){ load("paper/data/prediction_descriptives.RData") covars <- pred.descrip$covars } top.ngram.matrix <- data.table(top.ngram.matrix) setkey(top.ngram.matrix, eid) covars <- data.table(pred.descrip$covars) setkey(covars,eid) # restrict to the overlap of the two datasets covars <- covars[covars$eid %in% top.ngram.matrix$eid,] top.ngram.matrix <- top.ngram.matrix[top.ngram.matrix$eid %in% covars$eid,] # rename the cited column in case it doesn't appear names(covars)[names(covars) == 'cited'] <- 'cited.x' # then merge also to facilitate some manipulations below d <- merge(covars, top.ngram.matrix, by="eid", all=FALSE) # Note that this duplicates some column names so X gets appended in a # few cases. # construct model matrices x.controls <- sparse.model.matrix(cited.x ~ language.x + modal_country + month.x, data=d)[,-1] x.aff <- sparse.model.matrix(cited.x ~ affiliation, data=d)[,-1] x.subj <- sparse.model.matrix(cited.x ~ subject.x, data=d)[,-1] x.venue <- sparse.model.matrix(cited.x ~ source_title, data=d)[,-1] x.ngrams <- as.matrix(subset(top.ngram.matrix, select=-eid)) x.ngrams <- as(x.ngrams, "sparseMatrix") X <- cBind(x.controls, covars$year.x, covars$works.cited) X.aff <- cBind(X, x.aff) X.subj <- cBind(X.aff, x.subj) X.venue <- cBind(X.subj, x.venue) X.terms <- cBind(X.venue, x.ngrams) Y <- covars$cited ### Hold-back sample for testing model performance later on: set.seed(20160719) holdback.index <- sample(nrow(X), round(nrow(X)*.1)) X.hold <- X[holdback.index,] X.hold.aff <- X.aff[holdback.index,] X.hold.subj <- X.subj[holdback.index,] X.hold.venue <- X.venue[holdback.index,] X.hold.terms <- X.terms[holdback.index,] Y.hold <- Y[holdback.index] X.test <- X[-holdback.index,] X.test.aff <- X.aff[-holdback.index,] X.test.subj <- X.subj[-holdback.index,] X.test.venue <- X.venue[-holdback.index,] X.test.terms <- X.terms[-holdback.index,] Y.test <- Y[-holdback.index] ############### Models and prediction set.seed(20160719) m.con <- cv.glmnet(X.test, Y.test, alpha=1, family="binomial", type.measure="class") con.pred = predict(m.con, type="class", s="lambda.min", newx=X.hold) m.aff <- cv.glmnet(X.test.aff, Y.test, alpha=1, family="binomial", type.measure="class") aff.pred = predict(m.aff, type="class", s="lambda.min", newx=X.hold.aff) m.subj <- cv.glmnet(X.test.subj, Y.test, alpha=1, family="binomial", type.measure="class") subj.pred = predict(m.subj, type="class", s="lambda.min", newx=X.hold.subj) m.venue <- cv.glmnet(X.test.venue, Y.test, alpha=1, family="binomial", type.measure="class") venue.pred = predict(m.venue, type="class", s="lambda.min", newx=X.hold.venue) m.terms <- cv.glmnet(X.test.terms, Y.test, alpha=1, family="binomial", type.measure="class") terms.pred = predict(m.terms, type="class", s="lambda.min", newx=X.hold.terms) ########## # Compare test set predictions against held-back sample: pred.df <- data.frame(cbind(con.pred, aff.pred, subj.pred, venue.pred, terms.pred)) names(pred.df) <- c("Controls", "+ Affiliation", "+ Subject", "+ Venue", "+ Terms") m.list <- list(m.con, m.aff, m.subj, m.venue, m.terms) # collect: # df # percent.deviance # nonzero coefficients # prediction error gen.m.summ.info <- function(model){ df <- round(tail(model$glmnet.fit$df, 1),0) percent.dev <- round(tail(model$glmnet.fit$dev.ratio, 1),2)*100 cv.error <- round(tail(model$cvm,1),2)*100 # null.dev <- round(tail(model$glmnet.fit$nulldev),0) out <- c(df, percent.dev, cv.error) return(out) } gen.class.err <- function(pred, test){ props <- prop.table(table(pred, test)) err.sum <- round(sum(props[1,2], props[2,1]),2)*100 return(err.sum) } results.tab <- cbind(names(pred.df),data.frame(matrix(unlist(lapply(m.list, gen.m.summ.info)), byrow=T, nrow=5))) results.tab$class.err <- sapply(pred.df, function(x) gen.class.err(x, Y.hold)) results.tab <- data.frame(lapply(results.tab, as.character)) names(results.tab) <- c("Model", "N features", "Deviance (%)", "CV error (%)", "Hold-back error (%)") print(xtable(results.tab, caption= "Summary of fitted models predicting any citations. The ``Model'' column describes which features were included. The N features column shows the number of features included in the prediction. ``Deviance'' summarizes the goodness of fit as a percentage of the total deviance accounted for by the model. ``CV error'' (cross-validation error) reports the prediction error rates of each model in the cross-validation procedure conducted as part of the parameter estimation process. ``Hold-back error'' shows the prediction error on a random 10 percent subset of the original dataset not included in any of the model estimation procedures.", label='tab:predict_models', align='llrrrr'), include.rownames=FALSE) # Store the results: predict.list$results.tab <- results.tab ############# Generate most salient coefficients nz.coefs <- data.frame( coef = colnames(X.test.terms)[which( coef(m.terms, s="lambda.min") != 0)], type = "term", beta = coef(m.terms, s="lambda.min")[which(coef(m.terms, s="lambda.min") != 0)]) nz.coefs$coef <- as.character(nz.coefs$coef) nz.coefs$type <- as.character(nz.coefs$type) nz.coefs <- nz.coefs[order(-abs(nz.coefs$beta)),] # comparison: #nz.coefs$type <- "terms" nz.coefs$type[grepl("(Intercept)", nz.coefs$coef)] <- NA nz.coefs$type[grepl("source_title", nz.coefs$coef)] <- "venue" nz.coefs$type[grepl("subject.x", nz.coefs$coef)] <- "subject" nz.coefs$type[grepl("affiliation", nz.coefs$coef)] <- "affiliation" nz.coefs$type[grepl("month.x", nz.coefs$coef)] <- "month" nz.coefs$type[grepl("modal_country", nz.coefs$coef)] <- "country" nz.coefs$type[grepl("language", nz.coefs$coef)] <- "language" nz.coefs$type[grepl("^20[0-9]{2}$", nz.coefs$coef)] <- "year" # cleanup nz.coefs$coef <- gsub("source_title", "", nz.coefs$coef) nz.coefs$coef <- gsub("subject.x", "", nz.coefs$coef) nz.coefs$coef <- gsub("affiliation","", nz.coefs$coef) nz.coefs$beta <- round(nz.coefs$beta, 3) names(nz.coefs) <- c("Feature", "Type", "Coefficient") predict.list$nz.coefs <- nz.coefs # table for all round(prop.table(table(nz.coefs$Type))*100, 2) # for top subsets round(prop.table(table(nz.coefs$Type[1:700]))*100, 2) round(prop.table(table(nz.coefs$Type[1:200]))*100, 2) round(prop.table(table(nz.coefs$Type[1:100]))*100, 2) print(xtable( as.matrix(head(nz.coefs, 10)), label='tab:nzcoefs', caption='Feature, variable type, and beta value for top 100 non-zero coefficients estimated by the best fitting model with all features included.', align='lllr' ), include.rownames=FALSE) # output save(predict.list, file="paper/data/prediction.RData")