]> code.communitydata.science - cdsc_reddit.git/blobdiff - clustering/umap_hdbscan_clustering_lsi.py
changes for archiving.
[cdsc_reddit.git] / clustering / umap_hdbscan_clustering_lsi.py
diff --git a/clustering/umap_hdbscan_clustering_lsi.py b/clustering/umap_hdbscan_clustering_lsi.py
deleted file mode 100644 (file)
index 09b3630..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-from umap_hdbscan_clustering import umap_hdbscan_job, umap_hdbscan_grid_sweep, umap_hdbscan_clustering_result
-from lsi_base import twoway_lsi_grid_sweep, lsi_mixin, lsi_result_mixin
-from grid_sweep import twoway_grid_sweep
-import fire
-from dataclasses import dataclass
-
-@dataclass
-class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_result_mixin):
-    pass 
-
-class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
-    def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False):
-        super().__init__(
-            infile,
-            outpath,
-            name,
-            umap_args,
-            hdbscan_args,
-            save_step1
-        )
-        super().set_lsi_dims(lsi_dims)
-
-    def get_info(self):
-        partial_result = super().get_info()
-        self.result = umap_hdbscan_clustering_result_lsi(**partial_result.__dict__,
-                                                         lsi_dimensions=self.lsi_dims)
-        return self.result
-
-class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
-    def __init__(self,
-                 inpath,
-                 lsi_dims,
-                 outpath,
-                 umap_args,
-                 hdbscan_args,
-                 save_step1
-                 ):
-
-        super().__init__(umap_hdbscan_lsi_job,
-                         _umap_hdbscan_lsi_grid_sweep,
-                         inpath,
-                         lsi_dims,
-                         outpath,
-                         umap_args,
-                         hdbscan_args,
-                         save_step1
-                         )
-        
-
-
-class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
-    def __init__(self,
-                 inpath,
-                 outpath,
-                 lsi_dim,
-                 umap_args,
-                 hdbscan_args,
-                 save_step1):
-
-        self.lsi_dim = lsi_dim
-        self.jobtype = umap_hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim)
-
-
-    def namer(self, *args, **kwargs):
-        s = umap_hdbscan_grid_sweep.namer(self, *args, **kwargs)
-        s += f"_lsi-{self.lsi_dim}"
-        return s
-
-def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
-                                    min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False):
-    """Run hdbscan clustering once or more with different parameters.
-    
-    Usage:
-    hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=[eom]> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
-
-    Keword arguments:
-    savefile: path to save the metadata and diagnostics 
-    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
-    outpath: path to output fit clusterings.
-    min_cluster_sizes: one or more integers indicating the minumum cluster size
-    min_samples: one ore more integers indicating the minimum number of samples used in the algorithm
-    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
-    cluster_selection_methods: one or more of "eom" or "leaf" eom gives larger clusters. 
-    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
-    """    
-
-
-    umap_args = {'n_neighbors':list(map(int, n_neighbors)),
-                 'learning_rate':list(map(float,learning_rate)),
-                 'min_dist':list(map(float,min_dist)),
-                 'local_connectivity':list(map(int,local_connectivity)),
-                 }
-
-    hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
-                    'min_samples':list(map(int,min_samples)),
-                    'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)),
-                    'cluster_selection_method':cluster_selection_methods}
-
-    obj = umap_hdbscan_lsi_grid_sweep(inpath,
-                                      lsi_dimensions,
-                                      outpath,
-                                      umap_args,
-                                      hdbscan_args,
-                                      save_step1
-                                      )
-                                 
-
-    obj.run(10)
-    obj.save(savefile)
-
-
-if __name__ == "__main__":
-    fire.Fire(run_umap_hdbscan_lsi_grid_sweep)

Community Data Science Collective || Want to submit a patch?