3 from sklearn.feature_extraction.text import CountVectorizer
# Module-level settings read by the script body below.
# n_features is handed to get_counts(), which forwards it to CountVectorizer as max_features.
7 n_features = 100000 # Gets the top n_features terms
# n_samples is handed to get_ids_and_abstracts() as length_limit; a falsy value (None) disables the limit check there.
8 n_samples = None # Enter an integer here for testing, so it doesn't take so long
# Script body: parse the command line, load the abstracts, count n-grams, write the result.
# NOTE(review): presumably the inside of a main() function (see the __main__ guard at the end of
# the file); the `def` line itself is not part of this listing -- confirm.
# Command-line options: -i input TSV of abstracts, -o output CSV, -n largest n-gram length.
12 parser = argparse.ArgumentParser(description='Take in abstracts, output CSV of n-gram counts')
13 parser.add_argument('-i', help='Location of the abstracts file',
14 default='processed_data/abstracts.tsv')
15 parser.add_argument('-o', help='Location of the output file',
16 default='processed_data/ngram_table.csv')
17 parser.add_argument('-n', type=int, help='Gets from 1 to n ngrams',
# NOTE(review): the rest of this add_argument call (e.g. any default for -n) is not visible here.
20 args = parser.parse_args()
# Stage 1: read document ids and abstract texts; n_samples optionally limits how many rows are read.
22 print("Loading dataset...")
# NOTE(review): t0 (the timer start used below) is assigned on a line that is not shown here.
24 doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
25 print("done in %0.3fs." % (time() - t0))
# Stage 2: build per-document term counts for 1..args.n grams, capped at n_features terms.
30 bags_o_words = get_counts(data_samples, n_features, args.n)
# Stage 3: append one (document_id, term, frequency) row per non-zero count to the output file.
# write_output() opens the file in append mode, so existing content is kept.
31 write_output(doc_ids, bags_o_words, args.o)
33 def get_counts(abstracts, n_features, ngram_max):
# Fit a CountVectorizer on `abstracts` and convert the result into per-document
# term -> count dictionaries via to_bags_o_words().
#   abstracts:  iterable of document texts passed to fit_transform()
#   n_features: forwarded as max_features (vocabulary size cap)
#   ngram_max:  upper end of ngram_range; n-grams of length 1..ngram_max are counted
# max_df=0.95 / min_df=2 are CountVectorizer's document-frequency cut-offs (terms in more
# than 95% of documents, or in fewer than 2 documents, are left out of the vocabulary).
34 tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
35 max_features=n_features,
# NOTE(review): one line of this call is missing from the listing at this point -- confirm
# which argument (if any) sits between max_features and ngram_range.
37 ngram_range = (1,ngram_max))
# NOTE(review): t0, used by the timing print below, is assigned on a line that is not shown.
39 tf = tf_vectorizer.fit_transform(abstracts)
40 print("done in %0.3fs." % (time() - t0))
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement -- update when the installed version requires it.
42 terms = tf_vectorizer.get_feature_names()
# NOTE(review): `freqs` is assigned on a line that is not shown (presumably derived from `tf`),
# and the function presumably returns bags_o_words, since the caller uses the result -- confirm.
44 bags_o_words = to_bags_o_words(terms, freqs)
def write_header(out_file):
    """Create (or truncate) ``out_file`` and write the CSV header row.

    The header names the three columns that ``write_output`` appends for
    every (document, term) pair: ``document_id``, ``term``, ``frequency``.

    Args:
        out_file: Path of the CSV file to create. Existing content is
            discarded because the file is opened in ``'w'`` mode.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' (Windows) would emit '\r\r\n'.
    with open(out_file, 'w', newline='') as o_f:
        out = csv.writer(o_f)
        out.writerow(['document_id', 'term', 'frequency'])
def to_bags_o_words(terms, freqs):
    """Convert a document-term count table into per-document dictionaries.

    Args:
        terms: Sequence of term strings; ``terms[i]`` labels column ``i``.
        freqs: Iterable of rows, one per document, where each row is an
            iterable of counts aligned with ``terms``.

    Returns:
        A list with one dictionary per row of ``freqs``, in the same order.
        Each dictionary maps term -> count and contains only the terms
        whose count in that document is greater than zero.
    """
    return [
        # Zero counts are dropped so each bag only holds terms that occur.
        {terms[i]: val for i, val in enumerate(row) if val > 0}
        for row in freqs
    ]
def write_output(ids, bags_o_words, out_file):
    """Append one CSV row per (document, term) pair to ``out_file``.

    Args:
        ids: Sequence of document identifiers; ``ids[i]`` belongs to the
            i-th entry of ``bags_o_words``.
        bags_o_words: Iterable of dictionaries mapping term -> count, one
            per document.
        out_file: Path of the CSV file. It is opened in append mode, so
            anything already in the file (for example a header row) is
            kept and the new rows follow it.

    Raises:
        IndexError: If ``bags_o_words`` has more entries than ``ids``.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' (Windows) would emit '\r\r\n'.
    with open(out_file, 'a', newline='') as o_f:
        out = csv.writer(o_f)
        for i, doc in enumerate(bags_o_words):
            doc_id = ids[i]
            # For each term and count, output a row together with the document id
            out.writerows([doc_id, term, count] for term, count in doc.items())
71 def get_ids_and_abstracts(fn, length_limit):
# Read the tab-separated file `fn` with csv.DictReader and collect the text found under
# each row's 'abstract' column.
#   fn:           path of the TSV file; the first line supplies the column names
#   length_limit: optional row limit; a falsy value (None or 0) disables the check below
# NOTE(review): the caller unpacks two values (document ids, abstracts), so this presumably
# returns a pair of parallel lists; the list setup, the loop header, the id column and the
# return statement are not visible in this listing -- confirm.
72 with open(fn, 'r') as f:
73 in_csv = csv.DictReader(f, delimiter='\t')
79 abstracts.append(r['abstract'])
# NOTE(review): the action taken when this condition holds is not shown (presumably it stops
# reading). Depending on where `i` starts, `>` may admit one or more rows beyond
# length_limit -- verify.
83 if length_limit and i > length_limit:
# Entry-point guard: the guarded statement runs only when the file is executed as a script,
# not when it is imported. NOTE(review): the guarded statement itself is not in this listing.
88 if __name__ == '__main__':