'''Creates the figures and tables for LaTeX'''
import pandas as pd
import numpy as np
import argparse
import os

topic_names = [
    'Media Use',
    'Social Network Analysis',
    'Consumer Analysis',
    'Education',
    'Quantitative Analysis',
    'Information Spread',
    'Health',
    'Sentiment Analysis',
    'News',
    'HCI',
    'Influence',
    'Methodology'
]


def main():
    parser = argparse.ArgumentParser(
        description='Takes the LDA info and top words and creates an RData file with summary statistics')
    parser.add_argument('-a', help='Abstracts LDA file',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-w', help='Top words file',
                        default='processed_data/top_words.csv')
    parser.add_argument('-t', help='Topic tables directory',
                        default='paper/tables/')
    parser.add_argument('-o', help='RData output file location',
                        default='paper/data/topic_model_data.RData')
    args = parser.parse_args()

    # Make the top_words tables
    tw = pd.read_csv(args.w)

    # Add names
    tw.columns = topic_names

    # Save as 2 different tables, because they are too long for one page
    if not os.path.exists(args.t):
        os.makedirs(args.t)
    tw.to_latex(os.path.join(args.t, 'topic_words1.tex'), index=False,
                columns=tw.columns[:6])
    tw.to_latex(os.path.join(args.t, 'topic_words2.tex'), index=False,
                columns=tw.columns[6:])

    # Load the abstracts and topics data
    df = pd.read_csv(args.a)
    n_topics = len(tw.columns)

    # Change to datetime
    df.date = pd.to_datetime(df.date)

    # Remove papers from 2016: we don't have the entire year, so graphs
    # that included it would be misleading
    df = df[df.date <= pd.to_datetime('2015-12-31')]
    df = df.set_index('date')

    # Rename the last columns as the topic names
    df.columns = list(df.columns[:-n_topics]) + topic_names

    # Group by year, and get only the LDA columns
    topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]

    # Get summary statistics for each topic:
    # total amount published in each topic by year
    topic_sums = topics_by_year.sum()
    # mean amount published in each topic
    topic_means = topics_by_year.mean()

    # Now, we weight the contributions by how much a paper has been cited.
    # Remember, each document has a distribution of topics that it belongs to,
    # so a given document might look like:
    #   T1: .5
    #   T2: .3
    #   T3: 0
    #   T4: .2
    # To account for how influential a paper is, we take all of the topic
    # columns for a document and multiply their weights by the logged
    # citation count of the paper.
    citation_weighted_topics = df[df.columns[-n_topics:]]
    citation_weighted_topics = citation_weighted_topics.apply(
        lambda x: x * np.log1p(df.cited_by_count), axis=0)
    weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()

    ## Write data to R

    # Import the code to write R objects, and collect the variables we'll write
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    r = {'weighted_sums': weighted_sums,
         'topic_sums': topic_sums,
         'topic_means': topic_means}
    # R convention uses dots rather than underscores in variable names
    for var_name, x in r.items():
        robjects.r.assign(var_name.replace("_", "."), x)
    if not os.path.exists(os.path.dirname(args.o)):
        os.makedirs(os.path.dirname(args.o))
    robjects.r('save({}, file = "{}")'.format(
        ",".join([k.replace("_", ".") for k in r.keys()]),
        args.o))
    robjects.r("rm(list=ls())")


if __name__ == '__main__':
    main()
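
# A minimal usage sketch. The script filename below is hypothetical; the
# flags and defaults match the argparse setup above:
#
#   python make_topic_tables.py \
#       -a processed_data/abstracts_LDA.csv \
#       -w processed_data/top_words.csv \
#       -t paper/tables/ \
#       -o paper/data/topic_model_data.RData
#
# The resulting RData file can then be loaded in an R session. Note that the
# variables are saved with dots in place of underscores:
#
#   load("paper/data/topic_model_data.RData")
#   head(topic.sums)
#   head(topic.means)
#   head(weighted.sums)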