+ # Optionally cap document frequency: drop terms appearing in more than
+ # max_df documents. NOTE(review): assumes the dataset's "count" field is a
+ # per-term document count — confirm against the dataset schema.
+ if max_df is not None:
+ ds_filter &= ds.field("count") <= max_df
+
+ # Column names derived from the term column (e.g. "term" -> "term_id").
+ term = term_colname
+ term_id = term + '_id'
+ term_id_new = term + '_id_new'
+
+ # Materialize the filtered (subreddit, term) rows into pandas; one row per
+ # subreddit-term pair with its relative term frequency.
+ df = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id',term_id,'relative_tf']).to_pandas()
+
+ # Re-index the surviving subreddits to a dense, sorted 0..K-1 id space.
+ sub_ids = df.subreddit_id.drop_duplicates()
+ new_sub_ids = pd.DataFrame({'subreddit_id':old,'subreddit_id_new':new} for new, old in enumerate(sorted(sub_ids)))
+ df = df.merge(new_sub_ids,on='subreddit_id',how='inner',validate='many_to_one')
+
+ # Recompute each term's document frequency AFTER filtering: the number of
+ # rows (i.e. distinct subreddits, one row per subreddit-term pair) per term.
+ new_count = df.groupby(term_id)[term_id].aggregate(new_count='count').reset_index()
+ df = df.merge(new_count,on=term_id,how='inner',validate='many_to_one')
+
+ # Likewise re-index the surviving terms to a dense, sorted 0..M-1 id space.
+ term_ids = df[term_id].drop_duplicates()
+ new_term_ids = pd.DataFrame({term_id:old,term_id_new:new} for new, old in enumerate(sorted(term_ids)))
+
+ df = df.merge(new_term_ids, on=term_id, validate='many_to_one')
+ # Total number of documents (distinct subreddits) in the filtered corpus.
+ N_docs = sub_ids.shape[0]
+
+ # Smoothed inverse document frequency: log(N / (1 + df)) + 1.
+ df['idf'] = np.log(N_docs/(1+df.new_count)) + 1
+
+ # agg terms by subreddit to make sparse tf/df vectors
+ # Two tf weighting schemes from the tf_weight enum (declared elsewhere):
+ # MaxTF uses relative_tf directly; otherwise the 0.5 + 0.5*tf "Norm05"
+ # double-normalization is applied before multiplying by idf.
+ if tf_family == tf_weight.MaxTF:
+ df["tf_idf"] = df.relative_tf * df.idf
+ else: # tf_fam = tf_weight.Norm05
+ df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf
+
+ # Lookup table mapping each subreddit name to its new dense id.
+ subreddit_names = df.loc[:,['subreddit','subreddit_id_new']].drop_duplicates()