- if exclude_phrases == True:
- tfidf = tfidf.filter(~f.col(term_colname).contains("_"))
-
- print("creating temporary parquet with matrix indicies")
- tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits)
- tfidf = spark.read.parquet(tempdir.name)
- subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
- subreddit_names = subreddit_names.sort_values("subreddit_id_new")
- subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
- spark.stop()
-
- print("loading matrix")
- mat = read_tfidf_matrix(tempdir.name, term_colname)
- print('computing similarities')
- sims = column_similarities(mat)
- del mat
-
- sims = pd.DataFrame(sims.todense())
- sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
- sims['subreddit'] = subreddit_names.subreddit.values
-
- p = Path(outfile)
-
- output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
- output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
- output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
-
- sims.to_feather(outfile)
- tempdir.cleanup()
-
-def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
- return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',