X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/4e20dce18834f7276776a1ab824ff95e8c44ef99..806cfc948f2721bf2ea754bcf0c3b8b0e3d0adf8:/similarities/similarities_helper.py?ds=sidebyside diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 69516a6..3ace8f2 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -60,7 +60,7 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre if included_subreddits is None: included_subreddits = select_topN_subreddits(topN) else: - included_subreddits = set(open(included_subreddits)) + included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits)))) if exclude_phrases == True: tfidf = tfidf.filter(~f.col(term_colname).contains("_")) @@ -75,18 +75,22 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre spark.stop() return (tempdir, subreddit_names) -def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): +def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): + ''' + tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. + ''' if from_date is not None or to_date is not None: - tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date) + tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date) else: - tempdir, subreddit_names = reindex_tfidf(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) + tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) print("loading matrix") # mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname) - mat = read_tfidf_matrix(tempdir.name, term_colname) - print('computing similarities') + mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname) + print(f'computing similarities on mat. mat.shape:{mat.shape}') + print(f"size of mat is:{mat.data.nbytes}") sims = simfunc(mat) del mat @@ -108,14 +112,24 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non sims.to_feather(outfile) tempdir.cleanup() -def read_tfidf_matrix_weekly(path, term_colname, week): +def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'): term = term_colname term_id = term + '_id' term_id_new = term + '_id_new' dataset = ds.dataset(path,format='parquet') - entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new],filter=ds.field('week')==week).to_pandas() - return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1)))) + entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas() + return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1)))) + +def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'): + term = term_colname + term_id = term + '_id' + term_id_new = term + '_id_new' + dataset = ds.dataset(path,format='parquet') + print(f"tfidf_colname:{tfidf_colname}") + entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas() + return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)))) + def write_weekly_similarities(path, sims, week, names): sims['week'] = week @@ -127,15 +141,6 @@ def write_weekly_similarities(path, sims, week, names): sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values) sims.to_parquet(p / week.isoformat()) -def read_tfidf_matrix(path,term_colname): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - dataset = ds.dataset(path,format='parquet') - entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new]).to_pandas() - return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1)))) - def column_overlaps(mat): non_zeros = (mat != 0).astype('double') @@ -383,7 +388,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm return df -def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"): +def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"): rankdf = pd.read_csv(path) included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values) return included_subreddits