--- /dev/null
+from sklearn.cluster import KMeans
+import fire
+from pathlib import Path
+from multiprocessing import cpu_count
+from dataclasses import dataclass
+from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
+from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
+
+
+@dataclass
+class kmeans_clustering_result(clustering_result):
+ n_clusters:int
+ n_init:int
+ max_iter:int
+
+@dataclass
+class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
+ pass
+
+class kmeans_job(clustering_job):
+ def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
+ super().__init__(infile,
+ outpath,
+ name,
+ call=kmeans_job._kmeans_clustering,
+ n_clusters=n_clusters,
+ n_init=n_init,
+ max_iter=max_iter,
+ random_state=random_state,
+ verbose=verbose)
+
+ self.n_clusters=n_clusters
+ self.n_init=n_init
+ self.max_iter=max_iter
+
+ def _kmeans_clustering(mat, *args, **kwargs):
+
+ clustering = KMeans(*args,
+ **kwargs,
+ ).fit(mat)
+
+ return clustering
+
+
+ def get_info(self):
+ result = super().get_info()
+ self.result = kmeans_clustering_result(**result.__dict__,
+ n_init=n_init,
+ max_iter=max_iter)
+ return self.result
+
+
+class kmeans_lsi_job(kmeans_job, lsi_mixin):
+ def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
+ super().__init__(infile,
+ outpath,
+ name,
+ *args,
+ **kwargs)
+ super().set_lsi_dims(lsi_dims)
+
+ def get_info(self):
+ result = super().get_info()
+ self.result = kmeans_clustering_result_lsi(**result.__dict__,
+ lsi_dimensions=self.lsi_dims)
+ return self.result
+
+
+class kmeans_grid_sweep(grid_sweep):
+ def __init__(self,
+ inpath,
+ outpath,
+ *args,
+ **kwargs):
+ super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs)
+
+ def namer(self,
+ n_clusters,
+ n_init,
+ max_iter):
+ return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
+
+class _kmeans_lsi_grid_sweep(grid_sweep):
+ def __init__(self,
+ inpath,
+ outpath,
+ lsi_dim,
+ *args,
+ **kwargs):
+ self.lsi_dim = lsi_dim
+ self.jobtype = kmeans_lsi_job
+ super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+
+ def namer(self, *args, **kwargs):
+ s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
+ s += f"_lsi-{self.lsi_dim}"
+ return s
+
+class kmeans_lsi_grid_sweep(lsi_grid_sweep):
+ def __init__(self,
+ inpath,
+ lsi_dims,
+ outpath,
+ n_clusters,
+ n_inits,
+ max_iters
+ ):
+
+ super().__init__(kmeans_lsi_job,
+ _kmeans_lsi_grid_sweep,
+ inpath,
+ lsi_dims,
+ outpath,
+ n_clusters,
+ n_inits,
+ max_iters)
+
+def test_select_kmeans_clustering():
+ # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
+ # "test_hdbscan_author30k",
+ # min_cluster_sizes=[2],
+ # min_samples=[1,2],
+ # cluster_selection_epsilons=[0,0.05,0.1,0.15],
+ # cluster_selection_methods=['eom','leaf'],
+ # lsi_dimensions='all')
+ inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
+ outpath = "test_kmeans";
+ n_clusters=[200,300,400];
+ n_init=[1,2,3];
+ max_iter=[100000]
+
+ gs = kmeans_lsi_grid_sweep(inpath, 'all', outpath, n_clusters, n_init, max_iter)
+ gs.run(1)
+
+ cluster_selection_epsilons=[0,0.1,0.3,0.5];
+ cluster_selection_methods=['eom'];
+ lsi_dimensions='all'
+ gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
+ gs.run(20)
+ gs.save("test_hdbscan/lsi_sweep.csv")
+
+
+if __name__ == "__main__":
+
+ fire.Fire{'grid_sweep':kmeans_grid_sweep,
+ 'grid_sweep_lsi':kmeans_lsi_grid_sweep
+ 'cluster':kmeans_job,
+ 'cluster_lsi':kmeans_lsi_job}