code.communitydata.science - cdsc_reddit.git/commitdiff
add 2 more umap parameters excise_reindex
author Nathan TeBlunthuis <nathante@uw.edu>
Thu, 9 Jun 2022 00:27:37 +0000 (17:27 -0700)
committer Nathan TeBlunthuis <nathante@uw.edu>
Thu, 9 Jun 2022 00:27:37 +0000 (17:27 -0700)
clustering/Makefile
clustering/lsi_base.py
clustering/umap_hdbscan_clustering.py
clustering/umap_hdbscan_clustering_lsi.py

index 2ba9c0cd9ca48fafcd7155a0aab24260f5c22c7f..559a85ca0f8d01dc02a6b951f26f1dce97c59dee 100644 (file)
@@ -1,10 +1,10 @@
 #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
-srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
+srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
 kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
 
-umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1]
+umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10]
 
 hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
 affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
index 14bbfc55f8b0263209c9124afefad17dda4faaae..84dfa7bb424b412bdc862645fd035c975f2ef0ab 100644 (file)
@@ -29,7 +29,7 @@ class lsi_grid_sweep(grid_sweep):
         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
 
 class twoway_lsi_grid_sweep(twoway_grid_sweep):
-    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2, save_step1):
+    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2):
         self.jobtype = jobtype
         self.subsweep = subsweep
         inpath = Path(inpath)
@@ -40,5 +40,5 @@ class twoway_lsi_grid_sweep(twoway_grid_sweep):
 
         lsi_nums = [int(p.stem) for p in lsi_paths]
         self.hasrun = False
-        self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2, save_step1) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
+        self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
index 6a4d2a1488217aec9e4d4f23d1ed519c3b030d9e..5633d770391f06dd5488682b81cf03b6a6b4465e 100644 (file)
@@ -63,25 +63,28 @@ class umap_hdbscan_grid_sweep(twoway_grid_sweep):
               min_samples,
               cluster_selection_epsilon,
               cluster_selection_method,
+              n_components,
               n_neighbors,
               learning_rate,
               min_dist,
-              local_connectivity
+              local_connectivity,
+              densmap
               ):
-        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}"
+        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nc-{n_components}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}_dm-{densmap}"
 
 @dataclass
 class umap_hdbscan_clustering_result(hdbscan_clustering_result):
+    n_components:int
     n_neighbors:int
     learning_rate:float
     min_dist:float
     local_connectivity:int
+    densmap:bool
 
 class umap_hdbscan_job(twoway_clustering_job):
     def __init__(self, infile, outpath, name,
-                 umap_args = {"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1},
+                 umap_args = {"n_components":2,"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1,'densmap':False},
                  hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'},
-                 save_step1 = False,
                  *args,
                  **kwargs):
         super().__init__(infile,
@@ -91,15 +94,16 @@ class umap_hdbscan_job(twoway_clustering_job):
                          call2=umap_hdbscan_job._hdbscan_clustering,
                          args1=umap_args,
                          args2=hdbscan_args,
-                         save_step1=save_step1,
                          *args,
                          **kwargs
                          )
 
+        self.n_components = umap_args['n_components']
         self.n_neighbors = umap_args['n_neighbors']
         self.learning_rate = umap_args['learning_rate']
         self.min_dist = umap_args['min_dist']
         self.local_connectivity = umap_args['local_connectivity']
+        self.densmap = umap_args['densmap']
         self.min_cluster_size = hdbscan_args['min_cluster_size']
         self.min_samples = hdbscan_args['min_samples']
         self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon']
@@ -139,14 +143,17 @@ class umap_hdbscan_job(twoway_clustering_job):
                                                      min_samples=self.min_samples,
                                                      cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                      cluster_selection_method=self.cluster_selection_method,
+                                                     n_components = self.n_components,
                                                      n_neighbors = self.n_neighbors,
                                                      learning_rate = self.learning_rate,
                                                      min_dist = self.min_dist,
-                                                     local_connectivity=self.local_connectivity
+                                                     local_connectivity=self.local_connectivity,
+                                                     densmap=self.densmap
                                                      )
         return self.result
 
-def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
+def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1],
+                                densmap=[False],
                                 min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
     """Run umap + hdbscan clustering once or more with different parameters.
     
@@ -171,6 +178,8 @@ def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], l
                  'learning_rate':list(map(float,learning_rate)),
                  'min_dist':list(map(float,min_dist)),
                  'local_connectivity':list(map(int,local_connectivity)),
+                 'n_components':list(map(int, n_components)),
+                 'densmap':list(map(bool,densmap))
                  }
 
     hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
index 09b36304a7546199e6267b505d26eb75e9b2b139..3149939868aafdf24f37de827743c40cd6da7f9c 100644 (file)
@@ -9,14 +9,13 @@ class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_res
     pass 
 
 class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
-    def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False):
+    def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims):
         super().__init__(
             infile,
             outpath,
             name,
             umap_args,
-            hdbscan_args,
-            save_step1
+            hdbscan_args
         )
         super().set_lsi_dims(lsi_dims)
 
@@ -32,8 +31,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
                  lsi_dims,
                  outpath,
                  umap_args,
-                 hdbscan_args,
-                 save_step1
+                 hdbscan_args
                  ):
 
         super().__init__(umap_hdbscan_lsi_job,
@@ -42,8 +40,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
                          lsi_dims,
                          outpath,
                          umap_args,
-                         hdbscan_args,
-                         save_step1
+                         hdbscan_args
                          )
         
 
@@ -55,11 +52,11 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
                  lsi_dim,
                  umap_args,
                  hdbscan_args,
-                 save_step1):
+                 ):
 
         self.lsi_dim = lsi_dim
         self.jobtype = umap_hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, lsi_dim)
 
 
     def namer(self, *args, **kwargs):
@@ -67,8 +64,9 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
         s += f"_lsi-{self.lsi_dim}"
         return s
 
-def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
-                                    min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False):
+def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1], 
+                                densmap=[False],
+                                    min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all'):
     """Run hdbscan clustering once or more with different parameters.
     
     Usage:
@@ -90,6 +88,8 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15
                  'learning_rate':list(map(float,learning_rate)),
                  'min_dist':list(map(float,min_dist)),
                  'local_connectivity':list(map(int,local_connectivity)),
+                 'n_components':list(map(int, n_components)),
+                 'densmap':list(map(bool,densmap))
                  }
 
     hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
@@ -101,8 +101,7 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15
                                       lsi_dimensions,
                                       outpath,
                                       umap_args,
-                                      hdbscan_args,
-                                      save_step1
+                                      hdbscan_args
                                       )
                                  
 

Community Data Science Collective || Want to submit a patch?