# Reconstructed from the patch at
# https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/5a40465a629a1d7d95dbec9730d3950842bcb4f5..07b0dff9bc0dae2ab6f7fb7334007a5269a512ad:/clustering/umap_hdbscan_clustering_lsi.py
# (clustering/umap_hdbscan_clustering_lsi.py, blob 09b3630, removed by that commit).
"""Grid sweep of UMAP + HDBSCAN clusterings over LSI similarity matrices.

Combines the job / grid-sweep classes from ``umap_hdbscan_clustering`` with
the LSI mixins from ``lsi_base`` and exposes the sweep as a ``fire`` command
line entry point.
"""
from dataclasses import dataclass

import fire

from umap_hdbscan_clustering import umap_hdbscan_job, umap_hdbscan_grid_sweep, umap_hdbscan_clustering_result
from lsi_base import twoway_lsi_grid_sweep, lsi_mixin, lsi_result_mixin
from grid_sweep import twoway_grid_sweep


@dataclass
class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_result_mixin):
    """Clustering result combining the UMAP+HDBSCAN fields with the LSI mixin fields."""


class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
    """A single UMAP+HDBSCAN clustering job that also records its LSI dimensions."""

    def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False):
        """Initialise the underlying job, then register ``lsi_dims`` on it.

        All arguments except ``lsi_dims`` are forwarded positionally, in the
        order shown, to the parent ``__init__``; ``lsi_dims`` is handed to
        ``set_lsi_dims``.
        """
        super().__init__(
            infile,
            outpath,
            name,
            umap_args,
            hdbscan_args,
            save_step1,
        )
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return (and cache on ``self.result``) the parent's info plus ``lsi_dimensions``."""
        partial_result = super().get_info()
        # Re-wrap every attribute of the parent's result in the LSI-aware
        # result class, adding the LSI dimensionality of this job.
        self.result = umap_hdbscan_clustering_result_lsi(
            **vars(partial_result),
            lsi_dimensions=self.lsi_dims,
        )
        return self.result


class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
    """LSI-level sweep that delegates to ``_umap_hdbscan_lsi_grid_sweep``."""

    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 umap_args,
                 hdbscan_args,
                 save_step1
                 ):
        """Forward the job type, the per-dimension sweep class and all arguments to the parent."""
        super().__init__(umap_hdbscan_lsi_job,
                         _umap_hdbscan_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         umap_args,
                         hdbscan_args,
                         save_step1
                         )


class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
    """UMAP x HDBSCAN parameter sweep for one fixed LSI dimension."""

    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 umap_args,
                 hdbscan_args,
                 save_step1):
        """Remember ``lsi_dim`` and initialise the two-way sweep.

        ``lsi_dim`` and ``jobtype`` are assigned before calling the parent
        initialiser, which receives the bound ``self.namer``.
        """
        self.lsi_dim = lsi_dim
        self.jobtype = umap_hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim)

    def namer(self, *args, **kwargs):
        """Return the base UMAP+HDBSCAN job name with ``_lsi-<lsi_dim>`` appended."""
        # This class does not inherit from umap_hdbscan_grid_sweep, so its
        # namer is borrowed by calling it explicitly with this instance.
        s = umap_hdbscan_grid_sweep.namer(self, *args, **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s


def _as_list(value):
    """Return ``value`` as a list, wrapping a lone scalar or string in one.

    Python Fire evaluates flag values as Python literals, so ``--n_neighbors=15``
    arrives as the int ``15`` and ``--cluster_selection_methods=eom`` as the
    string ``'eom'`` rather than as one-element lists. Any other iterable
    (list, tuple, generator, ...) is materialised unchanged.
    """
    if isinstance(value, (str, bytes)):
        return [value]
    try:
        return list(value)
    except TypeError:
        # Not iterable: a single number (or other scalar) was supplied.
        return [value]


def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
                                    min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False):
    """Run umap + hdbscan clustering once or more with different parameters.

    Usage:
       umap_hdbscan_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_neighbors=N_NEIGHBORS --learning_rate=LEARNING_RATE --min_dist=MIN_DIST --local_connectivity=LOCAL_CONNECTIVITY --min_cluster_sizes=MIN_CLUSTER_SIZES --min_samples=MIN_SAMPLES --cluster_selection_epsilons=CLUSTER_SELECTION_EPSILONS --cluster_selection_methods=CLUSTER_SELECTION_METHODS --lsi_dimensions=LSI_DIMENSIONS --save_step1=SAVE_STEP1

    Every sweep parameter accepts either a single value or a list of values.
    The list defaults are never mutated.

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    n_neighbors: one or more integers for the umap 'n_neighbors' setting
    learning_rate: one or more floats for the umap 'learning_rate' setting
    min_dist: one or more floats for the umap 'min_dist' setting
    local_connectivity: one or more integers for the umap 'local_connectivity' setting
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    save_step1: forwarded unchanged to the sweep (presumably keeps the intermediate umap output; confirm against umap_hdbscan_job).
    """

    umap_args = {'n_neighbors': [int(v) for v in _as_list(n_neighbors)],
                 'learning_rate': [float(v) for v in _as_list(learning_rate)],
                 'min_dist': [float(v) for v in _as_list(min_dist)],
                 'local_connectivity': [int(v) for v in _as_list(local_connectivity)],
                 }

    hdbscan_args = {'min_cluster_size': [int(v) for v in _as_list(min_cluster_sizes)],
                    'min_samples': [int(v) for v in _as_list(min_samples)],
                    'cluster_selection_epsilon': [float(v) for v in _as_list(cluster_selection_epsilons)],
                    'cluster_selection_method': _as_list(cluster_selection_methods)}

    # lsi_dimensions is handed over as given ("all" or explicit dimensions);
    # interpreting it is left to the sweep class.
    obj = umap_hdbscan_lsi_grid_sweep(inpath,
                                      lsi_dimensions,
                                      outpath,
                                      umap_args,
                                      hdbscan_args,
                                      save_step1
                                      )

    # NOTE(review): 10 is passed straight to run(); presumably a worker/core
    # count -- confirm against the grid sweep base class.
    obj.run(10)
    obj.save(savefile)


if __name__ == "__main__":
    fire.Fire(run_umap_hdbscan_lsi_grid_sweep)