Updates to similarities code for smap project.
diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py
index 4f4e0d6f2c4f18b47d3d96ac0991fbc72fdb6aef..e533808826043f93a545e507ef1b9093ba47657d 100644
--- a/clustering/hdbscan_clustering.py
+++ b/clustering/hdbscan_clustering.py
@@ -1,39 +1,57 @@
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
 from dataclasses import dataclass
 import hdbscan
 from sklearn.neighbors import NearestNeighbors
 import plotnine as pn
 import numpy as np
-from itertools import product, starmap
+from itertools import product, starmap, chain
 import pandas as pd
-from sklearn.metrics import silhouette_score, silhouette_samples
-from pathlib import Path
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
 import fire
-from pyarrow.feather import write_feather
 
 def test_select_hdbscan_clustering():
-    select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
-                              "test_hdbscan_author30k",
-                              min_cluster_sizes=[2],
-                              min_samples=[1,2],
-                              cluster_selection_epsilons=[0,0.05,0.1,0.15],
-                              cluster_selection_methods=['eom','leaf'],
-                              lsi_dimensions='all')
-    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI"
+    select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
+                              "test_hdbscan_author30k",
+                              min_cluster_sizes=[2],
+                              min_samples=[1,2],
+                              cluster_selection_epsilons=[0,0.05,0.1,0.15],
+                              cluster_selection_methods=['eom','leaf'],
+                              lsi_dimensions='all')
+    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
     outpath = "test_hdbscan";
     min_cluster_sizes=[2,3,4];
     min_samples=[1,2,3];
     cluster_selection_epsilons=[0,0.1,0.3,0.5];
     cluster_selection_methods=['eom'];
     lsi_dimensions='all'
     outpath = "test_hdbscan";
     min_cluster_sizes=[2,3,4];
     min_samples=[1,2,3];
     cluster_selection_epsilons=[0,0.1,0.3,0.5];
     cluster_selection_methods=['eom'];
     lsi_dimensions='all'
-
-    df = pd.read_csv("test_hdbscan/selection_data.csv")
-    test_select_hdbscan_clustering()
-    check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
-    silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
-    c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
-
+    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
+    gs.run(20)
+    gs.save("test_hdbscan/lsi_sweep.csv")
+    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
+    # job1.run()
+    # print(job1.get_info())
+
+    # df = pd.read_csv("test_hdbscan/selection_data.csv")
+    # test_select_hdbscan_clustering()
+    # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
+    # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
+    # c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
+class hdbscan_grid_sweep(grid_sweep):
+    def __init__(self,
+                 inpath,
+                 outpath,
+                 *args,
+                 **kwargs):
+
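+        # the grid_sweep base class (interface assumed here) receives the job
+        # class, in/out paths, a naming function, and the remaining parameter
+        # lists to sweep over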
+        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)
+
+    def namer(self,
+              min_cluster_size,
+              min_samples,
+              cluster_selection_epsilon,
+              cluster_selection_method):
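+        # e.g. namer(2, 1, 0.1, 'eom') -> "mcs-2_ms-1_cse-0.1_csm-eom"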
+        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
 
 @dataclass
 class hdbscan_clustering_result(clustering_result):
@@ -41,107 +59,70 @@ class hdbscan_clustering_result(clustering_result):
     min_samples:int
     cluster_selection_epsilon:float
     cluster_selection_method:str
-    lsi_dimensions:int
-    n_isolates:int
-    silhouette_samples:str
-
-def select_hdbscan_clustering(inpath,
-                              outpath,
-                              outfile=None,
-                              min_cluster_sizes=[2],
-                              min_samples=[1],
-                              cluster_selection_epsilons=[0],
-                              cluster_selection_methods=['eom'],
-                              lsi_dimensions='all'
-                              ):
-
-    inpath = Path(inpath)
-    outpath = Path(outpath)
-    outpath.mkdir(exist_ok=True, parents=True)
-    
-    if lsi_dimensions == 'all':
-        lsi_paths = list(inpath.glob("*"))
-
-    else:
-        lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
-    lsi_nums = [p.stem for p in lsi_paths]
-    grid = list(product(lsi_nums,
-                        min_cluster_sizes,
-                        min_samples,
-                        cluster_selection_epsilons,
-                        cluster_selection_methods))
-
-    # fix the output file names
-    names = list(map(lambda t:'_'.join(map(str,t)),grid))
-
-    grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
-        
-    with Pool(int(cpu_count()/4)) as pool:
-        mods = starmap(hdbscan_clustering, grid)
 
-    res = pd.DataFrame(mods)
-    if outfile is None:
-        outfile = outpath / "selection_data.csv"
-
-    res.to_csv(outfile)
-
-def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-    subreddits, mat = read_similarity_mat(similarities)
-    mat = sim_to_dist(mat)
-    clustering = _hdbscan_clustering(mat,
-                                     min_cluster_size=min_cluster_size,
-                                     min_samples=min_samples,
-                                     cluster_selection_epsilon=cluster_selection_epsilon,
-                                     cluster_selection_method=cluster_selection_method,
-                                     metric='precomputed',
-                                     core_dist_n_jobs=cpu_count()
-                                     )
-
-    cluster_data = process_clustering_result(clustering, subreddits)
-    isolates = clustering.labels_ == -1
-    scoremat = mat[~isolates][:,~isolates]
-    score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-    cluster_data.to_feather(output)
-
-    silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-    silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
-    silsampout = output.parent / ("silhouette_samples" + output.name)
-    silhouette_samp.to_feather(silsampout)
-
-    result = hdbscan_clustering_result(outpath=output,
-                                       max_iter=None,
-                                       silhouette_samples=silsampout,
-                                       silhouette_score=score,
-                                       alt_silhouette_score=score,
-                                       name=name,
-                                       min_cluster_size=min_cluster_size,
-                                       min_samples=min_samples,
-                                       cluster_selection_epsilon=cluster_selection_epsilon,
-                                       cluster_selection_method=cluster_selection_method,
-                                       lsi_dimensions=lsi_dim,
-                                       n_isolates=isolates.sum(),
-                                       n_clusters=len(set(clustering.labels_))
-                                   )
-
-
-                                       
-    return(result)
-
-# for all runs we should try cluster_selection_epsilon = None
-# for terms we should try cluster_selection_epsilon around 0.56-0.66
-# for authors we should try cluster_selection_epsilon around 0.98-0.99
-def _hdbscan_clustering(mat, *args, **kwargs):
-    print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
-
-    print(mat)
-    clusterer = hdbscan.HDBSCAN(*args,
-                                **kwargs,
-                                )
+class hdbscan_job(clustering_job):
+    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
+        super().__init__(infile,
+                         outpath,
+                         name,
+                         call=hdbscan_job._hdbscan_clustering,
+                         min_cluster_size=min_cluster_size,
+                         min_samples=min_samples,
+                         cluster_selection_epsilon=cluster_selection_epsilon,
+                         cluster_selection_method=cluster_selection_method
+                         )
+
+        self.min_cluster_size = min_cluster_size
+        self.min_samples = min_samples
+        self.cluster_selection_epsilon = cluster_selection_epsilon
+        self.cluster_selection_method = cluster_selection_method
+#        self.mat = 1 - self.mat
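+        # note: with metric='precomputed' below, self.mat (set up by
+        # clustering_job, assumed here to already hold distances) is clustered
+        # directly; the disabled line above would convert a similarity matrix.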
+
+    @staticmethod
+    def _hdbscan_clustering(mat, *args, **kwargs):
+        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
+        print(mat)
+        clusterer = hdbscan.HDBSCAN(*args,
+                                    metric='precomputed',
+                                    core_dist_n_jobs=cpu_count(),
+                                    **kwargs,
+                                    )
+    
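+        # cast to float64: hdbscan requires a double-precision matrix when
+        # metric='precomputed'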
+        clustering = clusterer.fit(mat.astype('double'))
     
-    clustering = clusterer.fit(mat.astype('double'))
+        return(clustering)
+
+    def get_info(self):
+        result = super().get_info()
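+        # extend the generic result from clustering_job with the
+        # hdbscan-specific parameters recorded on this job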
+        self.result = hdbscan_clustering_result(**result.__dict__,
+                                                min_cluster_size=self.min_cluster_size,
+                                                min_samples=self.min_samples,
+                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
+                                                cluster_selection_method=self.cluster_selection_method)
+        return self.result
+
+def run_hdbscan_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
+    """Run hdbscan clustering once or more with different parameters.
     
-    return(clustering)
+    Usage:
+    hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to feather data containing a labeled matrix of subreddit similarities.
+    outpath: path to output fit hdbscan clusterings.
+    min_cluster_sizes: one or more integers indicating the minimum cluster size
+    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
+    cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan
+    cluster_selection_methods: "eom" or "leaf"; "eom" gives larger clusters.
+    """    
+    obj = hdbscan_grid_sweep(inpath,
+                             outpath,
+                             map(int,min_cluster_sizes),
+                             map(int,min_samples),
+                             map(float,cluster_selection_epsilons),
+                             cluster_selection_methods) # methods are strings ('eom'/'leaf'), not floats
+    obj.run()
+    obj.save(savefile)
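+
+# Example invocation following the Usage string above (hypothetical paths):
+#   python3 hdbscan_clustering.py --savefile=sweep.csv \
+#       --inpath=/path/to/subreddit_similarities_LSI \
+#       --outpath=test_hdbscan \
+#       --min_cluster_sizes=2,3 --min_samples=1,2 \
+#       --cluster_selection_epsilons=0,0.1 --cluster_selection_methods=eom,leaf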
 
 def KNN_distances_plot(mat,outname,k=2):
     nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
@@ -172,4 +153,7 @@ def make_KNN_plots():
     KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
 
 if __name__ == "__main__":
-    fire.Fire(select_hdbscan_clustering)
+    fire.Fire(run_hdbscan_grid_sweep)
+    
+#    test_select_hdbscan_clustering()
+    #fire.Fire(select_hdbscan_clustering)  
