import argparse
import csv
from time import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
'''
This code was adapted from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.

It takes in an abstracts file and creates two outputs: the abstracts together with
their topic distributions, and the set of topics with the top words associated with each.
'''
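# A sketch of the expected input, inferred from the DictReader usage below: a
# tab-separated file whose header row includes an 'abstract' column. The other
# column names shown here are illustrative assumptions, not from the original.
#
#   pmid<TAB>title<TAB>abstract
#   12345<TAB>An example paper<TAB>We apply topic models to a corpus of abstracts ...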
n_samples = None  # Enter an integer here to subsample the abstracts for testing.
n_features = 1000  # Assumed value; the original vocabulary-size setting is not shown here.
n_topics = 10  # Assumed value; the original number-of-topics setting is not shown here.


def main():
    parser = argparse.ArgumentParser(
        description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
    parser.add_argument('-i', help='Abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Where to output results',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-t', help='Where to output topics and top words associated with them',
                        default='processed_data/top_words.csv')
    args = parser.parse_args()
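    # Example invocation (the script filename is an assumption, not from the original):
    #   python abstracts_lda.py -i processed_data/abstracts.tsv \
    #       -o processed_data/abstracts_LDA.csv -t processed_data/top_words.csv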
    print("Loading dataset...")
    t0 = time()
    dataset, doc_data = get_abstracts(args.i)
    data_samples = dataset[:n_samples]
    doc_data = doc_data[:n_samples]
    print("done in %0.3fs." % (time() - t0))
    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,  # Terms that show up in > max_df of documents are ignored
                                    min_df=2,  # Terms that show up in < min_df of documents are ignored
                                    max_features=n_features,  # Only use the top max_features terms
                                    stop_words='english')  # Assumed setting, following the scikit-learn example cited above
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online')
    t0 = time()
    transformed_model = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))
    # Change the values into a probability distribution for each abstract
    topic_dist = [[topic / sum(abstract_topics) for topic in abstract_topics]
                  for abstract_topics in transformed_model]
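    # For example, raw topic weights of [2.0, 1.0, 1.0] for one abstract become
    # [0.5, 0.25, 0.25], so each row sums to 1.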
    # Make the topic distribution into a dataframe
    td = pd.DataFrame(topic_dist)
    # Get the feature names (i.e., the words/terms)
    tf_feature_names = tf_vectorizer.get_feature_names_out()
    # Get the top words by topic
    topic_words = get_top_words(lda, tf_feature_names, 20)
    # Sort the topics by how heavily each one is used across the corpus
    topic_words = topic_words.reindex(
        columns=sorted(topic_words.columns, key=lambda x: td[x].sum(), reverse=True))

    # Rearrange the topic-distribution columns into the same order
    td = td.reindex(columns=sorted(td.columns, key=lambda x: td[x].sum(), reverse=True))
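    # e.g. if topic 3 has the largest total weight across all abstracts, its column
    # comes first in both topic_words and td.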
    topic_words.to_csv(args.t, index=False)
    # Pair each abstract's record with its topic distribution and write it out.
    df = pd.DataFrame(doc_data)
    # Assumption: the elided step joins the document data with the topic distribution.
    df = pd.concat([df, td], axis=1)
    df.to_csv(args.o, index=False)


def get_abstracts(fn):
    abstracts = []
    doc_data = []
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        for r in in_csv:
            curr_abstract = r['abstract']
            # If this isn't really an abstract, then don't add it
            if len(curr_abstract) > 5:
                # Add the abstract to the corpus, and save the record
                abstracts.append(curr_abstract)
                # Assumption: keep the full row so it can be written out with the topics.
                doc_data.append(r)
    return abstracts, doc_data


def get_top_words(model, feature_names, n_top_words):
    '''Takes the model, the words used, and the number of words requested.
    Returns a dataframe of the top n_top_words for each topic.'''
    r = pd.DataFrame()
    for i, topic in enumerate(model.components_):
        # Get the top feature names for this topic, and put them in column i
        r[i] = [add_quotes(feature_names[j])
                for j in topic.argsort()[:-n_top_words - 1:-1]]
    return r


def add_quotes(term):
    '''Adds quotes around multi-word phrases.'''
    # Minimal sketch of the elided body, assuming a phrase is any term containing a space.
    return '"%s"' % term if ' ' in term else term


if __name__ == '__main__':
    main()