from time import time
from sklearn.feature_extraction.text import CountVectorizer
import csv
import argparse

n_features = 100000  # Keep only the top n_features terms
n_samples = None  # Set to an integer while testing so runs stay short


def main():
    parser = argparse.ArgumentParser(
        description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, help='Count n-grams from 1 to n',
                        default=3)
    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - t0))

    # Write the header
    write_header(args.o)

    bags_o_words = get_counts(data_samples, n_features, args.n)
    write_output(doc_ids, bags_o_words, args.o)


def get_counts(abstracts, n_features, ngram_max):
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english',
                                    ngram_range=(1, ngram_max))
    print("Extracting n-gram counts...")
    t0 = time()
    tf = tf_vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - t0))
    # get_feature_names_out() replaces get_feature_names(), which was
    # deprecated in scikit-learn 1.0 and removed in 1.2.
    terms = tf_vectorizer.get_feature_names_out()
    # Note: toarray() densifies the document-term matrix, which can use a
    # lot of memory for large corpora with up to n_features columns.
    freqs = tf.toarray()
    bags_o_words = to_bags_o_words(terms, freqs)
    return bags_o_words


def write_header(out_file):
    # newline='' keeps the csv module from writing blank rows on Windows
    with open(out_file, 'w', newline='') as o_f:
        out = csv.writer(o_f)
        out.writerow(['document_id', 'term', 'frequency'])


def to_bags_o_words(terms, freqs):
    '''Takes the vectorizer output and returns a list of dictionaries,
    one per document, mapping term -> count within that document.
    '''
    result = []
    for d in freqs:
        curr_result = {terms[i]: val for i, val in enumerate(d) if val > 0}
        result.append(curr_result)
    return result


def write_output(ids, bags_o_words, out_file):
    with open(out_file, 'a', newline='') as o_f:
        out = csv.writer(o_f)
        for i, doc in enumerate(bags_o_words):
            # For each term and count, output a row with the document id
            for k, v in doc.items():
                out.writerow([ids[i], k, v])


def get_ids_and_abstracts(fn, length_limit):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        ids = []
        for i, r in enumerate(in_csv, start=1):
            # Stop after exactly length_limit rows when a limit is set
            if length_limit and i > length_limit:
                break
            try:
                abstracts.append(r['abstract'])
                ids.append(r['eid'])
            except KeyError:
                print(r)
    return ids, abstracts


if __name__ == '__main__':
    main()
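
# ---------------------------------------------------------------------------
# Usage sketch. The column names ('eid', 'abstract') are the ones this
# script reads; the script filename and the example rows below are
# hypothetical illustrations.
#
#   python ngram_counts.py -i processed_data/abstracts.tsv \
#       -o processed_data/ngram_table.csv -n 2
#
# Input: a tab-separated file with at least 'eid' and 'abstract' columns:
#
#   eid     abstract
#   001     Convolutional networks for image classification ...
#   002     Image classification with convolutional networks ...
#
# Output: one CSV row per non-zero n-gram count in each document
# (terms are lowercased; min_df=2 drops terms seen in only one document):
#
#   document_id,term,frequency
#   001,image classification,1
#   002,image classification,1
# ---------------------------------------------------------------------------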