'''Creates the figures and tables for LaTeX'''
import argparse
import os

import numpy as np
import pandas as pd
    'Social Network Analysis',
    'Quantitative Analysis',
parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
parser.add_argument('-a', help='Abstracts LDA file',
                    default='processed_data/abstracts_LDA.csv')
parser.add_argument('-w', help='Top words file',
                    default='processed_data/top_words.csv')
parser.add_argument('-t', help='Topic tables directory',
                    default='paper/tables/')
parser.add_argument('-o', help='RData output file location',
                    default='paper/data/topic_model_data.RData')

args = parser.parse_args()
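# Example invocation (the script name is illustrative):
#   python make_topic_tables.py -w processed_data/top_words.csv -o paper/data/topic_model_data.RData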
# Make the top_words tables
tw = pd.read_csv(args.w)
tw.columns = topic_names
# Save as two separate tables, because a single table is too long
if not os.path.exists(args.t):
    os.makedirs(args.t)
tw.to_latex(args.t + 'topic_words1.tex', index=False, columns=tw.columns[:6])
tw.to_latex(args.t + 'topic_words2.tex', index=False, columns=tw.columns[6:])
# Load the abstracts and topics data
df = pd.read_csv(args.a)
n_topics = len(tw.columns)

df.date = pd.to_datetime(df.date)

# Remove papers from 2016, since we don't have the entire year and the graphs would be misleading
df = df[df.date <= pd.to_datetime('2015-12-31')]
df = df.set_index('date')
# Rename the last columns as the topic names
df.columns = list(df.columns[:-n_topics]) + topic_names
# Group by year, and get only the LDA columns
topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]

# Get summary statistics for each topic
# Total amount published in each topic by year
topic_sums = topics_by_year.sum()
# Mean amount published in each topic by year
topic_means = topics_by_year.mean()
# Now, we weight the contributions by how much a paper has been cited.
# Remember, each document has a distribution over topics, so a given document might look like:
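#   (illustrative weights only, not real data)
#   Social Network Analysis: 0.60, Quantitative Analysis: 0.25, all other topics: 0.15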
# To account for how influential a paper is, we take all of the topic columns for a document
# and multiply their weights by the logged citation count the paper has received.
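# np.log1p(x) = log(1 + x), so an uncited paper gets a weight of 0 rather than -inf.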
citation_weighted_topics = df[df.columns[-n_topics:]]
citation_weighted_topics = citation_weighted_topics.apply(lambda x: x * np.log1p(df.cited_by_count), axis=0)
weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()
# Import rpy2 so we can push the summary data frames into R and save them to an RData file
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Enable automatic conversion of pandas DataFrames to R data.frames when assigning
pandas2ri.activate()
r = {'weighted_sums': weighted_sums,
     'topic_sums': topic_sums,
     'topic_means': topic_means}

for var_name, x in r.items():
    robjects.r.assign(var_name.replace("_", "."), x)
if not os.path.exists(os.path.dirname(args.o)):
    os.makedirs(os.path.dirname(args.o))

robjects.r('save({},file = "{}")'.format(
    ",".join([k.replace("_", ".") for k in r.keys()]),
    args.o))
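# The saved RData file (by default paper/data/topic_model_data.RData) can be loaded in R
# with load(), which restores weighted.sums, topic.sums and topic.means.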
robjects.r("rm(list=ls())")
if __name__ == '__main__':