Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex

diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py
index f0ee7038c75bab4df7960f5d49b68518aac85345..e533808826043f93a545e507ef1b9093ba47657d 100644
--- a/clustering/hdbscan_clustering.py
+++ b/clustering/hdbscan_clustering.py
@@ -1,5 +1,5 @@
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
-from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
 from dataclasses import dataclass
 import hdbscan
 from sklearn.neighbors import NearestNeighbors
@@ -7,11 +7,8 @@ import plotnine as pn
 import numpy as np
 from itertools import product, starmap, chain
 import pandas as pd
-from sklearn.metrics import silhouette_score, silhouette_samples
-from pathlib import Path
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
 import fire
-from pyarrow.feather import write_feather
 
 def test_select_hdbscan_clustering():
     # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
@@ -40,28 +37,6 @@ def test_select_hdbscan_clustering():
     # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
     # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
     # c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
-
-class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
-    def __init__(self,
-                 inpath,
-                 lsi_dims,
-                 outpath,
-                 min_cluster_sizes,
-                 min_samples,
-                 cluster_selection_epsilons,
-                 cluster_selection_methods
-                 ):
-
-        super().__init__(hdbscan_lsi_job,
-                         _hdbscan_lsi_grid_sweep,
-                         inpath,
-                         lsi_dims,
-                         outpath,
-                         min_cluster_sizes,
-                         min_samples,
-                         cluster_selection_epsilons,
-                         cluster_selection_methods)
-        
 class hdbscan_grid_sweep(grid_sweep):
     def __init__(self,
                  inpath,
@@ -78,25 +53,6 @@ class hdbscan_grid_sweep(grid_sweep):
               cluster_selection_method):
         return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
 
-
-class _hdbscan_lsi_grid_sweep(grid_sweep):
-    def __init__(self,
-                 inpath,
-                 outpath,
-                 lsi_dim,
-                 *args,
-                 **kwargs):
-
-        self.lsi_dim = lsi_dim
-        self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
-
-
-    def namer(self, *args, **kwargs):
-        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
-        s += f"_lsi-{self.lsi_dim}"
-        return s
-
 @dataclass
 class hdbscan_clustering_result(clustering_result):
     min_cluster_size:int
@@ -104,10 +60,6 @@ class hdbscan_clustering_result(clustering_result):
     cluster_selection_epsilon:float
     cluster_selection_method:str
 
-@dataclass
-class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
-    pass 
-
 class hdbscan_job(clustering_job):
     def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
         super().__init__(infile,
@@ -148,121 +100,29 @@ class hdbscan_job(clustering_job):
                                                 cluster_selection_method=self.cluster_selection_method)
         return self.result
 
-class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
-    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
-        super().__init__(
-                         infile,
-                         outpath,
-                         name,
-                         *args,
-                         **kwargs)
-        super().set_lsi_dims(lsi_dims)
-
-    def get_info(self):
-        partial_result = super().get_info()
-        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
-                                                    lsi_dimensions=self.lsi_dims)
-        return self.result
-
-# def select_hdbscan_clustering(inpath,
-#                               outpath,
-#                               outfile=None,
-#                               min_cluster_sizes=[2],
-#                               min_samples=[1],
-#                               cluster_selection_epsilons=[0],
-#                               cluster_selection_methods=['eom'],
-#                               lsi_dimensions='all'
-#                               ):
-
-#     inpath = Path(inpath)
-#     outpath = Path(outpath)
-#     outpath.mkdir(exist_ok=True, parents=True)
-    
-#     if lsi_dimensions is None:
-#         lsi_paths = [inpath]
-#     elif lsi_dimensions == 'all':
-#         lsi_paths = list(inpath.glob("*"))
-
-#     else:
-#         lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
-#     if lsi_dimensions is not None:
-#         lsi_nums = [p.stem for p in lsi_paths]
-#     else:
-#         lsi_nums = [None]
-#     grid = list(product(lsi_nums,
-#                         min_cluster_sizes,
-#                         min_samples,
-#                         cluster_selection_epsilons,
-#                         cluster_selection_methods))
-
-#     # fix the output file names
-#     names = list(map(lambda t:'_'.join(map(str,t)),grid))
-
-#     grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
-        
-#     with Pool(int(cpu_count()/4)) as pool:
-#         mods = starmap(hdbscan_clustering, grid)
-
-#     res = pd.DataFrame(mods)
-#     if outfile is None:
-#         outfile = outpath / "selection_data.csv"
-
-#     res.to_csv(outfile)
-
-# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-#     subreddits, mat = read_similarity_mat(similarities)
-#     mat = sim_to_dist(mat)
-#     clustering = _hdbscan_clustering(mat,
-#                                      min_cluster_size=min_cluster_size,
-#                                      min_samples=min_samples,
-#                                      cluster_selection_epsilon=cluster_selection_epsilon,
-#                                      cluster_selection_method=cluster_selection_method,
-#                                      metric='precomputed',
-#                                      core_dist_n_jobs=cpu_count()
-#                                      )
-
-#     cluster_data = process_clustering_result(clustering, subreddits)
-#     isolates = clustering.labels_ == -1
-#     scoremat = mat[~isolates][:,~isolates]
-#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-#     cluster_data.to_feather(output)
-#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-#     silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
-#     silsampout = output.parent / ("silhouette_samples" + output.name)
-#     silhouette_samp.to_feather(silsampout)
-
-#     result = hdbscan_clustering_result(outpath=output,
-#                                        silhouette_samples=silsampout,
-#                                        silhouette_score=score,
-#                                        name=name,
-#                                        min_cluster_size=min_cluster_size,
-#                                        min_samples=min_samples,
-#                                        cluster_selection_epsilon=cluster_selection_epsilon,
-#                                        cluster_selection_method=cluster_selection_method,
-#                                        lsi_dimensions=lsi_dim,
-#                                        n_isolates=isolates.sum(),
-#                                        n_clusters=len(set(clustering.labels_))
-#                                    )
-
-
-                                       
-#     return(result)
-
-# # for all runs we should try cluster_selection_epsilon = None
-# # for terms we should try cluster_selection_epsilon around 0.56-0.66
-# # for authors we should try cluster_selection_epsilon around 0.98-0.99
-# def _hdbscan_clustering(mat, *args, **kwargs):
-#     print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
-
-#     print(mat)
-#     clusterer = hdbscan.HDBSCAN(*args,
-#                                 **kwargs,
-#                                 )
-    
-#     clustering = clusterer.fit(mat.astype('double'))
+def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
+    """Run hdbscan clustering once or more with different parameters.
     
-#     return(clustering)
+    Usage:
+    hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
+
+    Keyword arguments:
+    savefile: path to save the metadata and diagnostics
+    inpath: path to feather data containing a labeled matrix of subreddit similarities.
+    outpath: path to output fit hdbscan clusterings.
+    min_cluster_sizes: one or more integers indicating the minimum cluster size
+    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
+    cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan
+    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
+    """    
+    obj = hdbscan_grid_sweep(inpath,
+                             outpath,
+                             map(int,min_cluster_sizes),
+                             map(int,min_samples),
+                             map(float,cluster_selection_epsilons),
+                             map(str,cluster_selection_methods))
+    obj.run()
+    obj.save(savefile)
 
 def KNN_distances_plot(mat,outname,k=2):
     nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
@@ -293,10 +153,7 @@ def make_KNN_plots():
     KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
 
 if __name__ == "__main__":
-    fire.Fire{'grid_sweep':hdbscan_grid_sweep,
-              'grid_sweep_lsi':hdbscan_lsi_grid_sweep
-              'cluster':hdbscan_job,
-              'cluster_lsi':hdbscan_lsi_job}
+    fire.Fire(run_hdbscan_grid_sweep)
     
 #    test_select_hdbscan_clustering()
     #fire.Fire(select_hdbscan_clustering)  

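The diff replaces the old grid_sweep/cluster command dictionary with a single fire entry point, run_hdbscan_grid_sweep. As a quick illustration, here is a minimal sketch of driving it directly from Python rather than through the CLI shown in its docstring; the file paths and parameter values are hypothetical placeholders, and it assumes the clustering/ directory is on the Python path:

    from hdbscan_clustering import run_hdbscan_grid_sweep

    # Sweep a small grid of hdbscan parameters over one similarity matrix.
    # Every path below is a hypothetical placeholder.
    run_hdbscan_grid_sweep(
        savefile="hdbscan_sweep.csv",                  # sweep metadata and diagnostics
        inpath="similarities/comment_terms.feather",   # labeled subreddit similarity matrix
        outpath="clusterings/hdbscan",                 # directory for the fitted clusterings
        min_cluster_sizes=[2, 5],
        min_samples=[1, 2],
        cluster_selection_epsilons=[0, 0.1],
        cluster_selection_methods=["eom"],
    )

The sweep fits one clustering per combination of the supplied parameters; obj.run() executes the jobs and obj.save(savefile) writes the collected clustering_result metadata.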