code.communitydata.science - cdsc_reddit.git/blob - clustering/umap_hdbscan_clustering_lsi.py
add support for umap->hdbscan clustering method
[cdsc_reddit.git] / clustering / umap_hdbscan_clustering_lsi.py
1 from umap_hdbscan_clustering import umap_hdbscan_job, umap_hdbscan_grid_sweep, umap_hdbscan_clustering_result
2 from lsi_base import twoway_lsi_grid_sweep, lsi_mixin, lsi_result_mixin
3 from grid_sweep import twoway_grid_sweep
4 import fire
5 from dataclasses import dataclass
6
@dataclass
class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_result_mixin):
    """Result record for a umap->hdbscan clustering run on LSI similarities.

    Declares no fields of its own: everything is inherited from
    ``umap_hdbscan_clustering_result`` and ``lsi_result_mixin``.
    """
10
class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
    """A umap->hdbscan clustering job that also tracks the LSI dimensions of its input."""

    def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False):
        """Initialise the parent job, then record ``lsi_dims``.

        Every argument except ``lsi_dims`` is passed positionally to the
        parent constructor in the order (infile, outpath, name, umap_args,
        hdbscan_args, save_step1). ``lsi_dims`` is stored through the
        inherited ``set_lsi_dims``.
        """
        super().__init__(infile, outpath, name, umap_args, hdbscan_args, save_step1)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the parent's result rebuilt with an ``lsi_dimensions`` field.

        The attributes of the parent's result object are copied into a new
        ``umap_hdbscan_clustering_result_lsi``, which is also stored on
        ``self.result``.
        """
        base_fields = vars(super().get_info())
        self.result = umap_hdbscan_clustering_result_lsi(
            **base_fields,
            lsi_dimensions=self.lsi_dims,
        )
        return self.result
28
class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
    """Grid sweep of umap->hdbscan clustering jobs over LSI dimensions.

    Binds ``twoway_lsi_grid_sweep`` to ``umap_hdbscan_lsi_job`` as the job
    type and ``_umap_hdbscan_lsi_grid_sweep`` as the sub-sweep class.
    """

    def __init__(self, inpath, lsi_dims, outpath, umap_args, hdbscan_args, save_step1):
        """Forward all arguments, unchanged and in order, to the parent sweep."""
        job_cls = umap_hdbscan_lsi_job
        subsweep_cls = _umap_hdbscan_lsi_grid_sweep
        super().__init__(
            job_cls,
            subsweep_cls,
            inpath,
            lsi_dims,
            outpath,
            umap_args,
            hdbscan_args,
            save_step1,
        )
48         
49
50
class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
    """Sweep of umap and hdbscan parameter combinations for one LSI dimension."""

    def __init__(self, inpath, outpath, lsi_dim, umap_args, hdbscan_args, save_step1):
        """Store the LSI dimension and job type, then build the two-way sweep."""
        # Both attributes are assigned before delegating: self.namer reads
        # self.lsi_dim, and the bound method is handed to the parent
        # constructor, which may call it while building the grid.
        self.lsi_dim = lsi_dim
        self.jobtype = umap_hdbscan_lsi_job
        # NOTE(review): the trailing extras are passed as (save_step1, lsi_dim),
        # whereas umap_hdbscan_lsi_job takes (..., lsi_dims, save_step1=False).
        # Confirm how twoway_grid_sweep forwards them to the job constructor.
        super().__init__(
            self.jobtype,
            inpath,
            outpath,
            self.namer,
            umap_args,
            hdbscan_args,
            save_step1,
            lsi_dim,
        )

    def namer(self, *args, **kwargs):
        """Return the ``umap_hdbscan_grid_sweep`` name with an ``_lsi-<dim>`` suffix."""
        base_name = umap_hdbscan_grid_sweep.namer(self, *args, **kwargs)
        return base_name + f"_lsi-{self.lsi_dim}"
69
def _as_list(values):
    """Return ``values`` as a list, wrapping a lone scalar or string in one."""
    # A string is iterable, but here it is a single value (e.g. "eom"), so it
    # must not be split into characters.
    if isinstance(values, (str, bytes)):
        return [values]
    try:
        return list(values)
    except TypeError:
        # Not iterable: a single number such as 15 or 0.1.
        return [values]


def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors=[15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
                                    min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1=False):
    """Run umap->hdbscan clustering once or more with different parameters.

    Usage:
    umap_hdbscan_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_neighbors=<csv> --learning_rate=<csv> --min_dist=<csv> --local_connectivity=<csv> --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<eom|leaf> --lsi_dimensions=<all|csv> [--save_step1]

    Every sweepable argument accepts either a single value or a sequence of
    values; a single value is treated as a one-element sequence.

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    n_neighbors: one or more integers for the umap `n_neighbors` setting
    learning_rate: one or more numbers for the umap `learning_rate` setting
    min_dist: one or more numbers for the umap `min_dist` setting
    local_connectivity: one or more integers for the umap `local_connectivity` setting
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    save_step1: passed through unchanged to the grid sweep and its jobs.
    """
    # The list defaults in the signature are never mutated: each argument is
    # copied into a fresh list before use.
    umap_args = {
        'n_neighbors': [int(v) for v in _as_list(n_neighbors)],
        'learning_rate': [float(v) for v in _as_list(learning_rate)],
        'min_dist': [float(v) for v in _as_list(min_dist)],
        'local_connectivity': [int(v) for v in _as_list(local_connectivity)],
    }

    hdbscan_args = {
        'min_cluster_size': [int(v) for v in _as_list(min_cluster_sizes)],
        'min_samples': [int(v) for v in _as_list(min_samples)],
        'cluster_selection_epsilon': [float(v) for v in _as_list(cluster_selection_epsilons)],
        # Wrapped so that a bare "eom" is one method, not three characters.
        'cluster_selection_method': _as_list(cluster_selection_methods),
    }

    obj = umap_hdbscan_lsi_grid_sweep(inpath,
                                      lsi_dimensions,
                                      outpath,
                                      umap_args,
                                      hdbscan_args,
                                      save_step1)

    # NOTE(review): 10 is passed straight to run(); presumably the number of
    # parallel workers -- confirm against the grid sweep implementation.
    obj.run(10)
    obj.save(savefile)
111
112
if __name__ == "__main__":
    # Expose the sweep runner as a command-line interface; python-fire maps
    # command-line flags onto the function's parameters.
    fire.Fire(component=run_umap_hdbscan_lsi_grid_sweep)

Community Data Science Collective || Want to submit a patch?