from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import sys
import csv
import pandas as pd
import argparse

"""
This code was inspired by/copied from
http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.
It takes in an abstracts file and creates two outputs: the abstracts together with
their topic distributions, and a set of topics with the top words associated with each.
"""

n_samples = None  # Enter an integer here for testing.
n_features = 20000
n_topics = 12


def main():
    parser = argparse.ArgumentParser(
        description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
    parser.add_argument('-i', help='Abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Where to output results',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-t', help='Where to output topics and top words associated with them',
                        default='processed_data/top_words.csv')
    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    dataset, doc_data = get_abstracts(args.i)
    data_samples = dataset[:n_samples]
    doc_data = doc_data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,              # Terms that show up in > max_df of documents are ignored
                                    min_df=2,                 # Terms that show up in < min_df documents are ignored
                                    max_features=n_features,  # Only keep the top max_features terms
                                    stop_words='english',
                                    ngram_range=(1, 2))
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))

    print("Fitting LDA model with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=2017,
                                    n_jobs=2)
    t0 = time()
    # fit_transform fits the model and returns the (unnormalized) document-topic
    # matrix in one pass, so there is no need to call fit separately first.
    transformed_model = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    # Normalize each row into a probability distribution over topics for that abstract
    topic_dist = [[topic / sum(abstract_topics) for topic in abstract_topics]
                  for abstract_topics in transformed_model]

    # Make the topic distributions into a dataframe
    td = pd.DataFrame(topic_dist)

    # Get the feature names (i.e., the words/terms)
    tf_feature_names = tf_vectorizer.get_feature_names_out()

    # Get the top words for each topic
    topic_words = get_top_words(lda, tf_feature_names, 20)

    # Sort the topic columns by how often each topic is used
    topic_words = topic_words.reindex(sorted(topic_words.columns,
                                             key=lambda x: td[x].sum(),
                                             reverse=True), axis=1)

    # Rearrange the columns of the topic distributions in the same order
    td = td.reindex(sorted(td.columns,
                           key=lambda x: td[x].sum(),
                           reverse=True), axis=1)

    topic_words.to_csv(args.t, index=False)

    df = pd.DataFrame(doc_data)
    df = df.join(td)
    df.to_csv(args.o, index=False)


def get_abstracts(fn):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        doc_data = []
        for r in in_csv:
            try:
                curr_abstract = r['abstract']
                # If this isn't really an abstract, then don't add it
                if len(curr_abstract) > 5:
                    # Add the abstract to the corpus, and save the row's metadata
                    abstracts.append(r['abstract'])
                    doc_data.append(r)
            except KeyError:
                print(r)
    return abstracts, doc_data


def get_top_words(model, feature_names, n_top_words):
    '''Takes the model, the words used, and the number of words requested.
    Returns a dataframe of the top n_top_words for each topic.'''
    r = pd.DataFrame()
    # For each topic
    for i, topic in enumerate(model.components_):
        # Get the top feature names and put them in that topic's column
        r[i] = [add_quotes(feature_names[j])
                for j in topic.argsort()[:-n_top_words - 1:-1]]
    return r


def add_quotes(s):
    '''Adds quotes around multi-word phrases'''
    if " " in s:
        s = '"{}"'.format(s)
    return s


if __name__ == '__main__':
    main()
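# Usage example (a sketch): the script filename "abstract_topics.py" below is an
# assumption, not part of the original; the paths shown are just the argparse defaults.
#
#   python abstract_topics.py -i processed_data/abstracts.tsv \
#                             -o processed_data/abstracts_LDA.csv \
#                             -t processed_data/top_words.csv
#
# The input TSV is expected to have a header row that includes an 'abstract' column;
# any other columns are carried through unchanged and joined with the per-document
# topic distributions in the -o output file.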