code/topic_modeling/01_make_paper_files.py

'''Creates the LaTeX topic-word tables and the RData summary statistics used for the paper'''
   2
   3 import pandas as pd
   4 import numpy as np
   5 import datetime
   6 import argparse
   7 import os
   8
   9 topic_names = [
  10     'Media Use',
  11     'Social Network Analysis',
  12     'Consumer Analsyis',
  13     'Education',
  14     'Quantitative Analysis',
  15     'Information Spread',
  16     'Health',
  17     'Sentiment Analysis',
  18     'News',
  19     'HCI',
  20     'Influence',
  21     'Methodology'
  22 ]
  23
  24 def main():
  25
  26     parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
  27     parser.add_argument('-a', help='Abstracts LDA file',
  28             default='processed_data/abstracts_LDA.csv')
  29     parser.add_argument('-w', help='Top words file',
  30             default='processed_data/top_words.csv')
  31     parser.add_argument('-t', help='Topic tables directory',
  32             default='paper/tables/')
  33     parser.add_argument('-o', help = 'RData output file location',
  34             default = 'paper/data/topic_model_data.RData')
  35
  36     args = parser.parse_args()
  37
  38     # Make the top_words tables
  39     tw = pd.read_csv(args.w)
  40     # Add names
  41     tw.columns = topic_names
  42     # Save as 2 different tables, because they are too long
  43     if not os.path.exists(args.t):
  44         os.makedirs(args.t)
  45     tw.to_latex(args.t + 'topic_words1.tex',index=False, columns=tw.columns[:6])
  46     tw.to_latex(args.t + 'topic_words2.tex',index=False, columns=tw.columns[6:])
  47
  48     # Load the abstracts and topics data
  49     df = pd.read_csv(args.a)
  50     n_topics = len(tw.columns)
  51     # Change to datetime
  52     df.date = pd.to_datetime(df.date)
  53
  54     # Remove papers from 2016 since we don't have the entire year, so graphs are misleading
  55     df = df[df.date <= pd.to_datetime('2015-12-31')]
  56     df = df.set_index('date')
  57     # Rename the last columns as the topic names
  58     df.columns = list(df.columns[:-n_topics]) + topic_names
  59     # Group by year, and get only the LDA columns
  60     topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
  61     # Get summary statistics for each topic
  62     # Total amount published in each topic by year
  63     topic_sums = topics_by_year.sum()
  64     # Mean amount published in each topic
  65     topic_means = topics_by_year.mean()
  66     # Now, we weight the contributions by how much a paper has been cited.
  67     # Remember, each document has a distribution of topics that it belongs to, so a given document might look like:
  68     # T1: .5
  69     # T2: .3
  70     # T3: 0
  71     # T4: .2
  72     # To account for how influential a paper is, we take all of the topic columns for a document
  73     # and multiplies their weights by the logged citations the paper has received.
  74     citation_weighted_topics = df[df.columns[-n_topics:]]
  75     citation_weighted_topics = citation_weighted_topics.apply(lambda x: x * np.log1p(df.cited_by_count), axis=0)
  76     weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()
  77
  78     ## write data to R
  79     # import code to write r modules and create our variable we'll write to
  80     import rpy2.robjects as robjects
  81     from rpy2.robjects import pandas2ri
  82     pandas2ri.activate()
  83
  84
  85     r = {'weighted_sums' : weighted_sums,
  86          'topic_sums' : topic_sums,
  87          'topic_means' : topic_means }
  88
  89     for var_name, x in r.items():
  90         robjects.r.assign(var_name.replace("_", "."), x)
  91
  92     if not os.path.exists(os.path.dirname(args.o)):
  93         os.makedirs(os.path.dirname(args.o))
  94
  95     robjects.r('save({},file = "{}")'.format(
  96                                             ",".join([k.replace("_", ".") for k in r.keys()]),
  97                                             args.o
  98                                             ))
  99     robjects.r("rm(list=ls())")
 100
 101
 102 if __name__ == '__main__':
 103     main()