library(data.table)
library(Matrix)
library(glmnet)
library(xtable)
library(methods)

predict.list <- NULL

if(!exists("top.ngram.matrix")){
    load("processed_data/top.ngram.matrix.RData")
}

if(!exists("pred.descrip")){
    load("paper/data/prediction_descriptives.RData")
    covars <- pred.descrip$covars
}

top.ngram.matrix <- data.table(top.ngram.matrix)
setkey(top.ngram.matrix, eid)
covars <- data.table(pred.descrip$covars)
setkey(covars,eid)

# restrict to the overlap of the two datasets
covars <- covars[covars$eid %in% top.ngram.matrix$eid,]

top.ngram.matrix <- top.ngram.matrix[top.ngram.matrix$eid %in%
                                     covars$eid,]

# rename the cited column in case it doesn't appear
names(covars)[names(covars) == 'cited'] <- 'cited.x'

# then merge also to facilitate some manipulations below
d <- merge(covars, top.ngram.matrix, by="eid", all=FALSE)

# Note that this duplicates some column names so X gets appended in a
# few cases.

# construct model matrices
x.controls <- sparse.model.matrix(cited.x ~ language.x +
                                    modal_country + month.x,
                                   data=d)[,-1]

x.aff <- sparse.model.matrix(cited.x ~ affiliation, data=d)[,-1]
x.subj <- sparse.model.matrix(cited.x ~ subject.x, data=d)[,-1]
x.venue <- sparse.model.matrix(cited.x ~ source_title, data=d)[,-1]

x.ngrams <- as.matrix(subset(top.ngram.matrix, select=-eid))
x.ngrams <- as(x.ngrams, "sparseMatrix")

X <- cBind(x.controls, covars$year.x, covars$works.cited)
X.aff <- cBind(X, x.aff)
X.subj <- cBind(X.aff, x.subj)
X.venue <- cBind(X.subj, x.venue)
X.terms <- cBind(X.venue, x.ngrams)

Y <- covars$cited

### Hold-back sample for testing model performance later on:
set.seed(20160719)
holdback.index <- sample(nrow(X), round(nrow(X)*.1))

X.hold <- X[holdback.index,]
X.hold.aff <- X.aff[holdback.index,]
X.hold.subj <- X.subj[holdback.index,]
X.hold.venue <- X.venue[holdback.index,]
X.hold.terms <- X.terms[holdback.index,]
Y.hold <- Y[holdback.index]

X.test <- X[-holdback.index,]
X.test.aff <- X.aff[-holdback.index,]
X.test.subj <- X.subj[-holdback.index,]
X.test.venue <- X.venue[-holdback.index,]
X.test.terms <- X.terms[-holdback.index,]
Y.test <- Y[-holdback.index]

###############  Models and prediction

set.seed(20160719)

m.con <- cv.glmnet(X.test, Y.test, alpha=1, family="binomial",
                    type.measure="class")
con.pred = predict(m.con, type="class", s="lambda.min",
                    newx=X.hold)

m.aff <- cv.glmnet(X.test.aff, Y.test, alpha=1, family="binomial",
                    type.measure="class")
aff.pred = predict(m.aff, type="class", s="lambda.min",
                    newx=X.hold.aff)

m.subj <- cv.glmnet(X.test.subj, Y.test, alpha=1, family="binomial",
                    type.measure="class")
subj.pred = predict(m.subj, type="class", s="lambda.min",
                    newx=X.hold.subj)

m.venue <- cv.glmnet(X.test.venue, Y.test, alpha=1, family="binomial",
                    type.measure="class")
venue.pred = predict(m.venue, type="class", s="lambda.min",
                    newx=X.hold.venue)

m.terms <- cv.glmnet(X.test.terms, Y.test, alpha=1, family="binomial",
                    type.measure="class")
terms.pred = predict(m.terms, type="class", s="lambda.min",
                    newx=X.hold.terms)

##########
# Compare test set predictions against held-back sample:

pred.df <- data.frame(cbind(con.pred, aff.pred, subj.pred,
                          venue.pred, terms.pred))
names(pred.df) <- c("Controls", "+ Affiliation", "+ Subject", "+ Venue",
                          "+ Terms") 

m.list <- list(m.con, m.aff, m.subj, m.venue, m.terms)

# collect:
# df
# percent.deviance
# nonzero coefficients
# prediction error

gen.m.summ.info <- function(model){
    df <- round(tail(model$glmnet.fit$df, 1),0)
    percent.dev <- round(tail(model$glmnet.fit$dev.ratio, 1),2)*100
    cv.error <- round(tail(model$cvm,1),2)*100
#    null.dev <- round(tail(model$glmnet.fit$nulldev),0)
    out <- c(df, percent.dev, cv.error)
    return(out)
}

gen.class.err <- function(pred, test){
    props <- prop.table(table(pred, test))
    err.sum <- round(sum(props[1,2], props[2,1]),2)*100
    return(err.sum)
}


results.tab <- cbind(names(pred.df),data.frame(matrix(unlist(lapply(m.list,
                                               gen.m.summ.info)),
                                 byrow=T, nrow=5)))

results.tab$class.err <- sapply(pred.df, function(x) gen.class.err(x, 
                                                                   Y.hold))

results.tab <- data.frame(lapply(results.tab, as.character))


names(results.tab) <- c("Model", "N features", "Deviance (%)",
                                               "CV error (%)", "Hold-back error (%)")


print(xtable(results.tab,
             caption=
                 "Summary of fitted models predicting any citations. The ``Model'' column describes which features were included. The N features column shows the number of features included in the prediction. ``Deviance'' summarizes the goodness of fit as a percentage of the total deviance accounted for by the model. ``CV error'' (cross-validation error) reports the prediction error rates of each model in the cross-validation procedure conducted as part of the parameter estimation process. ``Hold-back error'' shows the prediction error on a random 10 percent subset of the original dataset not included in any of the model estimation procedures.",
             label='tab:predict_models', align='llrrrr'),
             include.rownames=FALSE)

# Store the results:
predict.list$results.tab <- results.tab


############# Generate most salient coefficients
nz.coefs <- data.frame(                       coef =
                           colnames(X.test.terms)[which(
                                       coef(m.terms, s="lambda.min")
                                       != 0)],
                       type = "term",
                       beta =
                           coef(m.terms,
                                s="lambda.min")[which(coef(m.terms,
                                                           s="lambda.min")
                                                      != 0)])

nz.coefs$coef <- as.character(nz.coefs$coef)
nz.coefs$type <- as.character(nz.coefs$type)
nz.coefs <- nz.coefs[order(-abs(nz.coefs$beta)),]

# comparison:

#nz.coefs$type <- "terms"
nz.coefs$type[grepl("(Intercept)", nz.coefs$coef)] <- NA
nz.coefs$type[grepl("source_title", nz.coefs$coef)] <- "venue"
nz.coefs$type[grepl("subject.x", nz.coefs$coef)] <- "subject"
nz.coefs$type[grepl("affiliation", nz.coefs$coef)] <- "affiliation"
nz.coefs$type[grepl("month.x", nz.coefs$coef)] <- "month"
nz.coefs$type[grepl("modal_country", nz.coefs$coef)] <- "country"
nz.coefs$type[grepl("language", nz.coefs$coef)] <- "language"
nz.coefs$type[grepl("^20[0-9]{2}$", nz.coefs$coef)] <- "year"


# cleanup 
nz.coefs$coef <- gsub("source_title", "", nz.coefs$coef)
nz.coefs$coef <- gsub("subject.x", "", nz.coefs$coef)
nz.coefs$coef <- gsub("affiliation","", nz.coefs$coef)
nz.coefs$beta <- round(nz.coefs$beta, 3)
names(nz.coefs) <- c("Feature", "Type", "Coefficient")

predict.list$nz.coefs <- nz.coefs

# table for all
round(prop.table(table(nz.coefs$Type))*100, 2)

# for top subsets
round(prop.table(table(nz.coefs$Type[1:700]))*100, 2)
round(prop.table(table(nz.coefs$Type[1:200]))*100, 2)
round(prop.table(table(nz.coefs$Type[1:100]))*100, 2)

print(xtable(
    as.matrix(head(nz.coefs, 10)),
    label='tab:nzcoefs',
    caption='Feature, variable type, and beta value for top 100 non-zero coefficients estimated by the best fitting model with all features included.',
    align='lllr'
), include.rownames=FALSE)


# output
save(predict.list, file="paper/data/prediction.RData")