code/topic_modeling/00_topics_extraction.py

   1
   2 from time import time
   3
   4 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
   5 from sklearn.decomposition import NMF, LatentDirichletAllocation
   6 import sys
   7 import csv
   8 import pandas as pd
   9 import argparse
  10
"""
This code was inspired by (and partly copied from) the scikit-learn example at http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.

It takes in a file of abstracts and creates two outputs: (1) the abstracts together with their topic distributions, and (2) the set of topics together with the top words associated with each topic.
"""
  16
# Upper bound on how many abstracts main() keeps (used as a slice end);
# None means "keep everything".
n_samples = None # Enter an integer here for testing.
# Handed to CountVectorizer as max_features, i.e. the vocabulary size limit.
n_features = 20000
# Handed to LatentDirichletAllocation as n_components, i.e. the topic count.
n_topics = 12
  20
  21 def main():
  22
  23     parser = argparse.ArgumentParser(description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
  24     parser.add_argument('-i', help='Abstracts file',
  25             default='processed_data/abstracts.tsv')
  26     parser.add_argument('-o', help='Where to output  results',
  27             default='processed_data/abstracts_LDA.csv')
  28     parser.add_argument('-t', help='Where to output topics and top words associated with them',
  29             default='processed_data/top_words.csv')
  30     args = parser.parse_args()
  31
  32     print("Loading dataset...")
  33     t0 = time()
  34     dataset, doc_data = get_abstracts(args.i)
  35     data_samples = dataset[:n_samples]
  36     doc_data = doc_data[:n_samples]
  37     print("done in %0.3fs." % (time() - t0))
  38
  39     # Use tf (raw term count) features for LDA.
  40     print("Extracting tf features for LDA...")
  41     tf_vectorizer = CountVectorizer(max_df=0.95, # Terms that show up in > max_df of documents are ignored
  42                                     min_df=2, # Terms that show up in < min_df of documents are ignored
  43                                     max_features=n_features, # Only use the top max_features
  44                                     stop_words='english',
  45                                     ngram_range=(1,2))
  46     t0 = time()
  47     tf = tf_vectorizer.fit_transform(data_samples)
  48     print("done in %0.3fs." % (time() - t0))
  49
  50
  51     print("Fitting LDA models with tf features, "
  52           "n_samples=%d and n_features=%d..."
  53           % (len(data_samples), n_features))
  54     lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
  55                                     learning_method='online',
  56                                     learning_offset=50.,
  57                                     random_state=2017,
  58                                     n_jobs=2)
  59     t0 = time()
  60     model = lda.fit(tf)
  61     transformed_model = lda.fit_transform(tf)
  62     print("done in %0.3fs." % (time() - t0))
  63
  64
  65     # Change the values into a probability distribution for each abstract
  66     topic_dist = [[topic/sum(abstract_topics) for topic in abstract_topics]
  67                           for abstract_topics in transformed_model]
  68
  69     # Make the topic distribution into a dataframe
  70     td = pd.DataFrame(topic_dist)
  71     # Get the feature names (i.e., the words/terms)
  72     tf_feature_names = tf_vectorizer.get_feature_names()
  73
  74
  75     # Get the top words by topic
  76     topic_words = get_top_words(lda, tf_feature_names, 20)
  77     # Sort by how often topic is used
  78     topic_words = topic_words.reindex_axis(sorted(topic_words.columns, key = lambda x: td[x].sum(), reverse=True),axis=1)
  79
  80     # Rearrange the columns by how often each topic is used
  81     td = td.reindex_axis(sorted(td.columns, key = lambda x: td[x].sum(), reverse=True),axis=1)
  82
  83     topic_words.to_csv(args.t, index=False)
  84
  85     df = pd.DataFrame(doc_data)
  86     df = df.join(td)
  87
  88     df.to_csv(args.o, index=False)
  89
  90 def get_abstracts(fn):
  91     with open(fn, 'r') as f:
  92         in_csv = csv.DictReader(f, delimiter='\t')
  93         abstracts = []
  94         doc_data = []
  95         for r in in_csv:
  96             try:
  97                 curr_abstract = r['abstract']
  98                 # If this isn't really an abstract, then don't add it
  99                 if len(curr_abstract) > 5:
 100                     # Add the abstracts to the corpus, and save the data
 101                     abstracts.append(r['abstract'])
 102                     doc_data.append(r)
 103             except KeyError:
 104                 print(r)
 105     return abstracts, doc_data
 106
 107 def get_top_words(model, feature_names, n_top_words):
 108     '''Takes the model, the words used, and the number of words requested.
 109     Returns a dataframe of the top n_top_words for each topic'''
 110     r = pd.DataFrame()
 111     # For each topic
 112     for i, topic in enumerate(model.components_):
 113         # Get the top feature names, and put them in that column
 114         r[i] = [add_quotes(feature_names[i])
 115                     for i in topic.argsort()[:-n_top_words - 1:-1]]
 116     return r
 117
 118 def add_quotes(s):
 119     '''Adds quotes around multiple term phrases'''
 120     if " " in s:
 121         s =  '"{}"'.format(s)
 122     return s
 123
 124
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()