- sim_dist = sim_dist.repartition(1)
- sim_dist.write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
+ sims.to_feather(outfile)
+ tempdir.cleanup()
+
+ # print(outfile)
+
+ # tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')
+
+ # if included_subreddits is None:
+ # included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
+ # included_subreddits = {s.strip('\n') for s in included_subreddits}
+
+ # else:
+ # included_subreddits = set(open(included_subreddits))
+
+ # sim_dist, tfidf = cosine_similarities(tfidf, 'author', min_df, included_subreddits, similarity_threshold)
+
+ # p = Path(outfile)
+
+ # output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
+ # output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
+ # output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
+ # sim_dist = sim_dist.entries.toDF()
+
+ # sim_dist = sim_dist.repartition(1)
+ # sim_dist.write.parquet(str(output_parquet),mode='overwrite',compression='snappy')