+ df = tfidf_ds.to_table(filter=ds_filter, columns=['subreddit', 'subreddit_id', term_id, 'relative_tf']).to_pandas()
+
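+ # reindex subreddits to contiguous, 1-based ids so they can serve
+ # (after subtracting 1) as sparse-matrix column indices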
+ sub_ids = df.subreddit_id.drop_duplicates()
+ new_sub_ids = pd.DataFrame([{'subreddit_id': old, 'subreddit_id_new': new} for new, old in enumerate(sorted(sub_ids), start=1)])
+ df = df.merge(new_sub_ids, on='subreddit_id', how='inner', validate='many_to_one')
+
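+ # document frequency: the number of subreddits each term appears in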
+ new_count = df.groupby(term_id)[term_id].aggregate(new_count='count').reset_index()
+ df = df.merge(new_count,on=term_id,how='inner',validate='many_to_one')
+
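+ # likewise reindex terms to contiguous, 1-based ids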
+ term_ids = df[term_id].drop_duplicates()
+ new_term_ids = pd.DataFrame([{term_id: old, term_id_new: new} for new, old in enumerate(sorted(term_ids), start=1)])
+
+ df = df.merge(new_term_ids, on=term_id, how='inner', validate='many_to_one')
+ N_docs = sub_ids.shape[0]
+
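+ # smoothed idf: idf_t = ln(N_docs / (1 + df_t)) + 1; the +1 in the denominator
+ # smooths the count and the trailing +1 keeps weights positive for ubiquitous terms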
+ df['idf'] = np.log(N_docs/(1+df.new_count)) + 1
+
+ # combine term frequency and inverse document frequency into the final weight
+ if tf_family == tf_weight.MaxTF:
+ df["tf_idf"] = df.relative_tf * df.idf
+ else: # tf_family == tf_weight.Norm05
+ df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf
+
+ subreddit_names = df.loc[:,['subreddit','subreddit_id_new']].drop_duplicates()
+ subreddit_names = subreddit_names.sort_values("subreddit_id_new")
+ return df, subreddit_names
+
+
+def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+ '''
+ tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
+ '''
+ print("loading matrix")
+ if from_date is not None or to_date is not None:
+ tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
+ mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname)
+ else:
+ entries, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases)
+ # the new ids are 1-based, so shift down to 0-based row/column indices
+ term_id_new = term_colname + '_id_new'
+ mat = csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1)))
+
+ print(f'computing similarities on mat. mat.shape: {mat.shape}')
+ print(f"mat.data is {mat.data.nbytes} bytes")
+ sims = simfunc(mat)
+ del mat
+
+ if issparse(sims):
+ sims = sims.todense()
+
+ print(f"shape of sims:{sims.shape}")
+ print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}")
+ sims = pd.DataFrame(sims)
+ sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
+ sims['_subreddit'] = subreddit_names.subreddit.values
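+ # sims is square: one column per subreddit, with '_subreddit' labeling the rows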
+
+ p = Path(outfile)
+
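+ # "".join(p.suffixes) captures multi-part extensions (e.g. ".csv.gz"),
+ # so the full extension is swapped for each output format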
+ output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
+ output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
+ output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
+
+ sims.to_feather(output_feather)
+
+ # clean up the temporary reindexed tf-idf files, if any were created
+ if from_date is not None or to_date is not None:
+ tempdir.cleanup()
+
+def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'):
+ term = term_colname
+ term_id = term + '_id'
+ term_id_new = term + '_id_new'
+
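+ # pyarrow pushes the week filter down to the parquet scan,
+ # so only the requested week is read into memory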
+ dataset = ds.dataset(path, format='parquet')
+ entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new', term_id_new], filter=ds.field('week')==week).to_pandas()
+ return csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1)))
+
+def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'):
+ term = term_colname
+ term_id = term + '_id'
+ term_id_new = term + '_id_new'