Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex

author Nathan TeBlunthuis <nathante@uw.edu>

Tue, 3 Aug 2021 22:13:21 +0000 (15:13 -0700)

committer Nathan TeBlunthuis <nathante@uw.edu>

Tue, 3 Aug 2021 22:13:21 +0000 (15:13 -0700)
author Nathan TeBlunthuis <nathante@uw.edu>
Tue, 3 Aug 2021 22:13:21 +0000 (15:13 -0700)
committer Nathan TeBlunthuis <nathante@uw.edu>
Tue, 3 Aug 2021 22:13:21 +0000 (15:13 -0700)
diff --git a/clustering/Makefile b/clustering/Makefile

index 69c6c15ba6280e4cac64dd6d734eb1f7f26bf086..9643f52842fa9e5f17ec3447a4e474e1aab8f669 100644 (file)
--- a/clustering/Makefile
+++ b/clustering/Makefile
@@ -2,9 +2,9 @@
  srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
  similarity_data=/gscratch/comdata/output/reddit_similarity
  clustering_data=/gscratch/comdata/output/reddit_clustering
-kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
-hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
-affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]"
+kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
+hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
+affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
  
  authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
  authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
@@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu
  ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
         $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
  
+${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+       $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
  
+${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+       $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
  
  clean_affinity:
         rm -f ${authors_10k_output}/affinity/selection_data.csv
diff --git a/clustering/grid_sweep.py b/clustering/grid_sweep.py

index 636dcbc4c378d561b446f07e1c2c9d91dcf51ab2..c0365d041480394b8cd95d258ea1279c6580c2a9 100644 (file)
--- a/clustering/grid_sweep.py
+++ b/clustering/grid_sweep.py
@@ -7,6 +7,7 @@ class grid_sweep:
      def __init__(self, jobtype, inpath, outpath, namer, *args):
          self.jobtype = jobtype
          self.namer = namer
+        print(*args)
          grid = list(product(*args))
          inpath = Path(inpath)
          outpath = Path(outpath)
diff --git a/clustering/hdbscan_clustering_lsi.py b/clustering/hdbscan_clustering_lsi.py

index 73b5276712436cc2376bb6fd6252297b49f364eb..cbd44bde8a995f2e2f0b0e9066f0d06331de6fa4 100644 (file)
--- a/clustering/hdbscan_clustering_lsi.py
+++ b/clustering/hdbscan_clustering_lsi.py
@@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep):
  
          self.lsi_dim = lsi_dim
          self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
  
  
      def namer(self, *args, **kwargs):
@@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2]
      obj = hdbscan_lsi_grid_sweep(inpath,
                                   lsi_dimensions,
                                   outpath,
-                                 map(int,min_cluster_sizes),
-                                 map(int,min_samples),
-                                 map(float,cluster_selection_epsilons),
+                                 list(map(int,min_cluster_sizes)),
+                                 list(map(int,min_samples)),
+                                 list(map(float,cluster_selection_epsilons)),
                                   cluster_selection_methods
                                   )
  
diff --git a/clustering/kmeans_clustering_lsi.py b/clustering/kmeans_clustering_lsi.py

index 20d582bea3bb69187f8bde5be66d66d96d05a0c4..bb006f3c5c0829f5515fb955066b3faa69a61483 100644 (file)
--- a/clustering/kmeans_clustering_lsi.py
+++ b/clustering/kmeans_clustering_lsi.py
@@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep):
          print(kwargs)
          self.lsi_dim = lsi_dim
          self.jobtype = kmeans_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
  
      def namer(self, *args, **kwargs):
          s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
diff --git a/clustering/pick_best_clustering.py b/clustering/pick_best_clustering.py

index 91c443e28a7cc47d31787ddad0ac417ff7d98f55..c541d234e212b5c5876280c4e43b65a6e876987a 100644 (file)
--- a/clustering/pick_best_clustering.py
+++ b/clustering/pick_best_clustering.py
@@ -2,15 +2,15 @@ import fire
  import pandas as pd
  from pathlib import Path
  import shutil
-
-selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv"
+selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv"
  
  outpath = 'test_best.feather'
+min_clusters=50; max_isolates=5000; min_cluster_size=2
  
  # pick the best clustering according to silhouette score subject to contraints
-def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
+def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
      df = pd.read_csv(selection_data,index_col=0)
-    df = df.sort_values("silhouette_score")
+    df = df.sort_values("silhouette_score",ascending=False)
  
      # not sure I fixed the bug underlying this fully or not.
      df['n_isolates_str'] = df.n_isolates.str.strip("[]")
@@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
      df.loc[df.n_isolates_0,'n_isolates'] = 0
      df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))
      
-    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]]
+    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]]
  
      print(best_cluster.to_dict())
      best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
-    
      shutil.copy(best_path,output)
  
  if __name__ == "__main__":
diff --git a/clustering/selection.py b/clustering/selection.py

index d2fa6de60ccd58ebc41d68985bd1708e5339ca54..81641db00155389739634075dcb413946da8672c 100644 (file)
--- a/clustering/selection.py
+++ b/clustering/selection.py
@@ -1,7 +1,38 @@
-import fire
-from select_affinity import select_affinity_clustering
-from select_kmeans import select_kmeans_clustering
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization
+
+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# plot silhouette_score as a function of isolates
+df = df.sort_values("silhouette_score")
+
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for hdbscan seems like this one: it has a decent number of 
+# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from omitted variables
+best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]  # NOTE(review): .iloc[df.shape[1]] uses the column count as a row position -- confirm intent
+
+best_leaf = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+
+if not tsne_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tsne_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")
  
-if __name__ == "__main__":
-    fire.Fire({"kmeans":select_kmeans_clustering,
-               "affinity":select_affinity_clustering})
diff --git a/density/Makefile b/density/Makefile

index d22339976a7a29858df2ddfb3556c5654b3762b1..90eba821894a76142aa143da8de8e5ca101d2fe5 100644 (file)
--- a/density/Makefile
+++ b/density/Makefile
@@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat
  
  /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
         start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
+       start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
+       start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
diff --git a/density/job_script.sh b/density/job_script.sh

index 7dfac144d9612cd5890059e1724ec76241078d27..e411ba78066618e5dcde1ce0029d719a48deff6c 100755 (executable)
--- a/density/job_script.sh
+++ b/density/job_script.sh
@@ -1,4 +1,4 @@
  #!/usr/bin/bash
  start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
-stop-all.sh
+singularity exec  /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
diff --git a/density/overlap_density.py b/density/overlap_density.py

index 5a8e91aee37251ecb37e4978eef5b01968184f34..20368249cd72c210a91e5d639213ce6edba6feef 100644 (file)
--- a/density/overlap_density.py
+++ b/density/overlap_density.py
@@ -1,11 +1,12 @@
  import pandas as pd
  from pandas.core.groupby import DataFrameGroupBy as GroupBy
+from pathlib import Path
  import fire
  import numpy as np
  import sys
  sys.path.append("..")
  sys.path.append("../similarities")
-from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
+from similarities.similarities_helper import reindex_tfidf
  
  # this is the mean of the ratio of the overlap to the focal size.
  # mean shared membership per focal community member
@@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i
  
  def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
      df = pd.read_feather(inpath)
-    df = df.drop('subreddit',1)
+    df = df.drop('_subreddit',1)
      np.fill_diagonal(df.values,0)
      df = agg(df, 0).reset_index()
      df = df.rename({0:'overlap_density'},axis='columns')
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
      df.to_feather(outpath)
      return df
  
@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
      # exclude the diagonal
      df = df.loc[df.subreddit != df.variable]
      res = agg(df.groupby(['subreddit','week'])).reset_index()
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
      res.to_feather(outpath)
      return res
  
diff --git a/dumps/pull_pushshift_comments.sh b/dumps/pull_pushshift_comments.sh

index 3f6d2c91b151712fcbca93ff67f9a03524b78774..40d82d817356cdb61ee0d7be3304daa1d158bcb6 100755 (executable)
--- a/dumps/pull_pushshift_comments.sh
+++ b/dumps/pull_pushshift_comments.sh
@@ -8,7 +8,5 @@ wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base
  wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
  wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
  
-# starting in 2020 we use daily dumps not monthly dumps
-wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/
  
  ./check_comments_shas.py
diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py

index 98f14544c218ca912633a25350712222c23c9d85..b9bab17b44a97a78070bd17f7cfc1dad407d0130 100644 (file)
--- a/similarities/cosine_similarities.py
+++ b/similarities/cosine_similarities.py
@@ -4,44 +4,49 @@ from pathlib import Path
  from similarities_helper import similarities, column_similarities
  from functools import partial
  
-def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
  
-    return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
+    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
  
  # change so that these take in an input as an optional argument (for speed, but also for idf).
  def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
  
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',
+def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+
+    return cosine_similarities(infile,
                                 'term',
                                 outfile,
                                 min_df,
                                 max_df,
                                 included_subreddits,
                                 topN,
+                               exclude_phrases,
                                 from_date,
                                 to_date
                                 )
  
-def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
+def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                 'author',
                                 outfile,
                                 min_df,
                                 max_df,
                                 included_subreddits,
                                 topN,
+                               exclude_phrases=False,
                                 from_date=from_date,
                                 to_date=to_date
                                 )
  
-def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
-    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',
+def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities(infile,
                                 'author',
                                 outfile,
                                 min_df,
                                 max_df,
                                 included_subreddits,
                                 topN,
+                               exclude_phrases=False,
                                 from_date=from_date,
                                 to_date=to_date,
                                 tfidf_colname='relative_tf'
diff --git a/similarities/job_script.sh b/similarities/job_script.sh

index 1f363cde91df098695370dc63d8c3fde9fef66ba..0c37103e2735c8af4a169c75a234ec4f2ea1ed96 100755 (executable)
--- a/similarities/job_script.sh
+++ b/similarities/job_script.sh
@@ -1,4 +1,4 @@
  #!/usr/bin/bash
  start_spark_cluster.sh
-singularity exec  /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
+singularity exec  /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py 
  singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py

index a4983b38ef4ca6d3bb248631ce6e3d8cb7340276..13845d155200d04cb270308c6f61ef924900bdc2 100644 (file)
--- a/similarities/similarities_helper.py
+++ b/similarities/similarities_helper.py
@@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
              'relative_tf':ds.field('relative_tf').cast('float32'),
              'tf_idf':ds.field('tf_idf').cast('float32')}
  
+
      df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
  
      df = df.to_pandas(split_blocks=True,self_destruct=True)
@@ -124,6 +125,17 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
  
      return (df, tfidf_ds, ds_filter)
  
+    with Pool(cpu_count()) as pool:
+        chunks = pool.imap_unordered(pull_names,batches) 
+        subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
+
+    subreddit_names = subreddit_names.set_index("subreddit_id")
+    new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
+    new_ids = new_ids.set_index('subreddit_id')
+    subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
+    subreddit_names = subreddit_names.drop("subreddit_id",1)
+    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
+    return(df, subreddit_names)
  
  def pull_names(batch):
      return(batch.to_pandas().drop_duplicates())
@@ -165,7 +177,6 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non
  
      print(f'computing similarities on mat. mat.shape:{mat.shape}')
      print(f"size of mat is:{mat.data.nbytes}",flush=True)
-    # transform this to debug term tfidf
      sims = simfunc(mat)
      del mat
  
@@ -256,13 +267,12 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
              yield (sims, n_dims)
          else:
              return sims
+    
  
  def column_similarities(mat):
      return 1 - pairwise_distances(mat,metric='cosine')
  
-# need to rewrite this so that subreddit ids and term ids are fixed over the whole thing.
-# this affords taking the LSI similarities.
-# fill all 0s if we don't have it.
+
  def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
      term = term_colname
      term_id = term + '_id'
@@ -295,7 +305,6 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
      subreddits = df.select(['subreddit']).distinct()
      subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
  
-    # df = df.cache()
      df = df.join(subreddits,on=['subreddit'])
  
      # map terms to indexes in the tfs and the idfs
diff --git a/similarities/tfidf.py b/similarities/tfidf.py

index 94dcbf59e7e2171552bd219a14a1a2373d6b19a3..19d30138457843df62ef440d3a75acc45b41df87 100644 (file)
--- a/similarities/tfidf.py
+++ b/similarities/tfidf.py
@@ -52,7 +52,7 @@ def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/commen
  
  def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                           topN=None,
-                         include_subreddits=None):
+                         included_subreddits=None):
  
      return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                          outpath,
@@ -63,7 +63,8 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
                          )
  
  def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
-                       topN=25000):
+                       topN=None,
+                       included_subreddits=None):
  
  
      return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
@@ -71,7 +72,7 @@ def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf
                          topN,
                          'term',
                          [],
-                        included_subreddits=None
+                        included_subreddits=included_subreddits
                          )
  
  
diff --git a/similarities/top_subreddits_by_comments.py b/similarities/top_subreddits_by_comments.py

index 1197b512a9063904e7a566c1def394a28e52a519..ff9293c209f1f86ecdd0c34a4f282c2cae8eb08c 100644 (file)
--- a/similarities/top_subreddits_by_comments.py
+++ b/similarities/top_subreddits_by_comments.py
@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
  df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
  
  df = df.join(prop_nsfw,on='subreddit')
-df = df.filter(df.prop_nsfw < 0.5)
+#df = df.filter(df.prop_nsfw < 0.5)
  
  win = Window.orderBy(f.col('n_comments').desc())
  df = df.withColumn('comments_rank', f.rank().over(win))
@@ -26,4 +26,4 @@ df = df.toPandas()
  
  df = df.sort_values("n_comments")
  
-df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
+df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)
author	Nathan TeBlunthuis <nathante@uw.edu>
	Tue, 3 Aug 2021 22:13:21 +0000 (15:13 -0700)
committer	Nathan TeBlunthuis <nathante@uw.edu>
	Tue, 3 Aug 2021 22:13:21 +0000 (15:13 -0700)
clustering/Makefile		patch \| blob \| history
clustering/grid_sweep.py		patch \| blob \| history
clustering/hdbscan_clustering_lsi.py		patch \| blob \| history
clustering/kmeans_clustering_lsi.py		patch \| blob \| history
clustering/pick_best_clustering.py		patch \| blob \| history
clustering/selection.py		patch \| blob \| history
density/Makefile		patch \| blob \| history
density/job_script.sh		patch \| blob \| history
density/overlap_density.py		patch \| blob \| history
dumps/pull_pushshift_comments.sh		patch \| blob \| history
similarities/cosine_similarities.py		patch \| blob \| history
similarities/job_script.sh		patch \| blob \| history
similarities/similarities_helper.py		patch \| blob \| history
similarities/tfidf.py		patch \| blob \| history
similarities/top_subreddits_by_comments.py		patch \| blob \| history