-# Job subclass that runs HDBSCAN clustering on an LSI-projected similarity matrix.
-class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
-    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
-        super().__init__(infile,
-                         outpath,
-                         name,
-                         *args,
-                         **kwargs)
-        # record the LSI dimensionality via the mixin
-        super().set_lsi_dims(lsi_dims)
-
-    def get_info(self):
-        # extend the base job's result with the number of LSI dimensions
-        partial_result = super().get_info()
-        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
-                                                    lsi_dimensions=self.lsi_dims)
-        return self.result
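-# A minimal usage sketch. The paths, parameter values, and the run() entry point
-# are assumptions for illustration, not part of this class:
-#
-#   job = hdbscan_lsi_job("similarities/lsi/300.feather", "clusters/lsi",
-#                         name="300_2_1_0_eom", lsi_dims=300,
-#                         min_cluster_size=2, min_samples=1,
-#                         cluster_selection_epsilon=0,
-#                         cluster_selection_method='eom')
-#   job.run()
-#   info = job.get_info()  # hdbscan_clustering_result_lsi with lsi_dimensions=300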
-
-# def select_hdbscan_clustering(inpath,
-#                               outpath,
-#                               outfile=None,
-#                               min_cluster_sizes=[2],
-#                               min_samples=[1],
-#                               cluster_selection_epsilons=[0],
-#                               cluster_selection_methods=['eom'],
-#                               lsi_dimensions='all'
-#                               ):
-
-#     inpath = Path(inpath)
-#     outpath = Path(outpath)
-#     outpath.mkdir(exist_ok=True, parents=True)
-
-#     if lsi_dimensions is None:
-#         lsi_paths = [inpath]
-#     elif lsi_dimensions == 'all':
-#         lsi_paths = list(inpath.glob("*"))
-#     else:
-#         lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]
-
-#     if lsi_dimensions is not None:
-#         lsi_nums = [p.stem for p in lsi_paths]
-#     else:
-#         lsi_nums = [None]
-#     grid = list(product(lsi_nums,
-#                         min_cluster_sizes,
-#                         min_samples,
-#                         cluster_selection_epsilons,
-#                         cluster_selection_methods))
-
-#     # build the output file names from the parameter tuples
-#     names = list(map(lambda t: '_'.join(map(str, t)), grid))
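-#     # e.g. the tuple (300, 2, 1, 0, 'eom') becomes the name "300_2_1_0_eom"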
-
-#     # note: when lsi_nums == [None] this still builds inpath / 'None.feather'
-#     grid = [(inpath / (str(t[0]) + '.feather'), outpath / (name + '.feather'), t[0], name) + t[1:]
-#             for t, name in zip(grid, names)]
-
-#     # use the pool's starmap; the bare itertools.starmap would run serially
-#     # and leave the pool unused
-#     with Pool(int(cpu_count() / 4)) as pool:
-#         mods = pool.starmap(hdbscan_clustering, grid)
-
-#     res = pd.DataFrame(mods)
-#     if outfile is None:
-#         outfile = outpath / "selection_data.csv"
-
-#     res.to_csv(outfile)
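-#     # selection_data.csv then holds one row per grid point (name, silhouette
-#     # score, parameters, ...) for comparing the candidate clusterings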
-
-# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-#     subreddits, mat = read_similarity_mat(similarities)
-#     mat = sim_to_dist(mat)
-#     clustering = _hdbscan_clustering(mat,
-#                                      min_cluster_size=min_cluster_size,
-#                                      min_samples=min_samples,
-#                                      cluster_selection_epsilon=cluster_selection_epsilon,
-#                                      cluster_selection_method=cluster_selection_method,
-#                                      metric='precomputed',
-#                                      core_dist_n_jobs=cpu_count())
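-#     # metric='precomputed' means HDBSCAN expects a distance matrix, which is
-#     # why sim_to_dist converts the similarity matrix above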
-
-#     cluster_data = process_clustering_result(clustering, subreddits)
-#     # HDBSCAN labels noise points ("isolates") -1; exclude them from the score
-#     isolates = clustering.labels_ == -1
-#     scoremat = mat[~isolates][:, ~isolates]
-#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-#     cluster_data.to_feather(output)
-#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-#     silhouette_samp = pd.DataFrame({'subreddit': subreddits, 'score': silhouette_samp})
-#     silsampout = output.parent / ("silhouette_samples" + output.name)
-#     silhouette_samp.to_feather(silsampout)
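-#     # e.g. for output "300_2_1_0_eom.feather" this writes a sibling file named
-#     # "silhouette_samples300_2_1_0_eom.feather" (note: no separator)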
-
-#     result = hdbscan_clustering_result(outpath=output,
-#                                        silhouette_samples=silsampout,
-#                                        silhouette_score=score,
-#                                        name=name,
-#                                        min_cluster_size=min_cluster_size,
-#                                        min_samples=min_samples,
-#                                        cluster_selection_epsilon=cluster_selection_epsilon,
-#                                        cluster_selection_method=cluster_selection_method,
-#                                        lsi_dimensions=lsi_dim,
-#                                        n_isolates=isolates.sum(),
-#                                        # don't count the -1 noise label as a cluster
-#                                        n_clusters=len(set(clustering.labels_) - {-1})
-#                                        )
-
-#     return result
-
-# # for all runs we should try cluster_selection_epsilon = None
-# # for terms we should try cluster_selection_epsilon around 0.56-0.66
-# # for authors we should try cluster_selection_epsilon around 0.98-0.99
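-# # e.g. a hypothetical selection run over the author epsilon range:
-# # select_hdbscan_clustering("similarities/lsi", "clusters/authors",
-# #                           cluster_selection_epsilons=[0.98, 0.985, 0.99],
-# #                           cluster_selection_methods=['eom', 'leaf'])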
-# def _hdbscan_clustering(mat, *args, **kwargs):
-#     print(f"running hdbscan clustering. args: {args}. kwargs: {kwargs}")
-#     print(mat)
-
-#     clusterer = hdbscan.HDBSCAN(*args, **kwargs)
-#     clustering = clusterer.fit(mat.astype('double'))
-#     # return the fitted model; hdbscan_clustering() above reads its labels_
-#     return clustering