Support passing in a list of tfidf vectors.

[cdsc_reddit.git] / similarities / similarities_helper.py
diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py

index 69516a6abfd69b28e70f7bbf02ec923745da10b3..3ace8f29f3922838009adfb4dccf77e5f03b1e34 100644 (file)
--- a/similarities/similarities_helper.py
+++ b/similarities/similarities_helper.py
@@ -60,7 +60,7 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
      if included_subreddits is None:
          included_subreddits = select_topN_subreddits(topN)
      else:
-        included_subreddits = set(open(included_subreddits))
+        included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits))))
  
      if exclude_phrases == True:
          tfidf = tfidf.filter(~f.col(term_colname).contains("_"))
@@ -75,18 +75,22 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
      spark.stop()
      return (tempdir, subreddit_names)
  
-def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
  
+def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+    '''
+    tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
+    '''
      if from_date is not None or to_date is not None:
-        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
+        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
          
      else:
-        tempdir, subreddit_names = reindex_tfidf(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)
+        tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)
  
      print("loading matrix")
      #    mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname)
-    mat = read_tfidf_matrix(tempdir.name, term_colname)
-    print('computing similarities')
+    mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname)
+    print(f'computing similarities on mat. mat.shape:{mat.shape}')
+    print(f"size of mat is:{mat.data.nbytes}")
      sims = simfunc(mat)
      del mat
  
@@ -108,14 +112,24 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non
      sims.to_feather(outfile)
      tempdir.cleanup()
  
-def read_tfidf_matrix_weekly(path, term_colname, week):
+def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'):
      term = term_colname
      term_id = term + '_id'
      term_id_new = term + '_id_new'
  
      dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new],filter=ds.field('week')==week).to_pandas()
-    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
+    entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas()
+    return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1))))
+
+def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'):
+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'
+    dataset = ds.dataset(path,format='parquet')
+    print(f"tfidf_colname:{tfidf_colname}")
+    entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas()
+    return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))))
+    
  
  def write_weekly_similarities(path, sims, week, names):
      sims['week'] = week
@@ -127,15 +141,6 @@ def write_weekly_similarities(path, sims, week, names):
      sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values)
      sims.to_parquet(p / week.isoformat())
  
-def read_tfidf_matrix(path,term_colname):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'
-
-    dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new]).to_pandas()
-    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
-    
  def column_overlaps(mat):
      non_zeros = (mat != 0).astype('double')
      
@@ -383,7 +388,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm
  
      return df
  
-def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"):
+def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
      rankdf = pd.read_csv(path)
      included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
      return included_subreddits