-
-def kmeans_clustering(similarities, *args, **kwargs):
- subreddits, mat = read_similarity_mat(similarities)
- mat = sim_to_dist(mat)
- clustering = _kmeans_clustering(mat, *args, **kwargs)
- cluster_data = process_clustering_result(clustering, subreddits)
- return(cluster_data)
-
-def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
-
- clustering = KMeans(n_clusters=n_clusters,
- n_init=n_init,
- max_iter=max_iter,
- random_state=random_state,
- verbose=verbose
- ).fit(mat)
-
- return clustering
-
-def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
- if name is None:
- name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
- print(name)
- sys.stdout.flush()
- outpath = outdir / (str(name) + ".feather")
- print(outpath)
- mat = sim_to_dist(mat)
- clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose)
-
- outpath.parent.mkdir(parents=True,exist_ok=True)
- cluster_data.to_feather(outpath)
- cluster_data = process_clustering_result(clustering, subreddits)
-
- try:
- score = silhouette_score(mat, clustering.labels_, metric='precomputed')
- except ValueError:
- score = None
-
- if alt_mat is not None:
- alt_distances = sim_to_dist(alt_mat)
- try:
- alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
- except ValueError:
- alt_score = None
+ max_iter:int
+
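+# Wraps a single k-means run as a clustering_job so the grid sweep defined
+# below can construct and run it like any other clustering job.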
+class kmeans_job(clustering_job):
+ def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
+ super().__init__(infile,
+ outpath,
+ name,
+ call=kmeans_job._kmeans_clustering,
+ n_clusters=n_clusters,
+ n_init=n_init,
+ max_iter=max_iter,
+ random_state=random_state,
+ verbose=verbose)
+
+ self.n_clusters=n_clusters
+ self.n_init=n_init
+ self.max_iter=max_iter
+
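+    # Fit sklearn's KMeans to the input matrix; extra positional and keyword
+    # arguments are forwarded to the KMeans constructor. This is passed as the
+    # `call=` argument above, so it is invoked as a plain function on the
+    # matrix rather than as a bound method.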
+    @staticmethod
+    def _kmeans_clustering(mat, *args, **kwargs):
+
+ clustering = KMeans(*args,
+ **kwargs,
+ ).fit(mat)
+
+ return clustering
+
+
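+    # Extend the generic result record from the parent class with the
+    # k-means parameters used for this run.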
+ def get_info(self):
+ result = super().get_info()
+ self.result = kmeans_clustering_result(**result.__dict__,
+ n_init=self.n_init,
+ max_iter=self.max_iter)
+ return self.result
+
+
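+# Sweeps kmeans_job over the supplied parameter settings; enumerating the
+# combinations is delegated to the grid_sweep base class, and `namer` gives
+# each combination a distinct output name.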
+class kmeans_grid_sweep(grid_sweep):
+
+ def __init__(self,
+ inpath,
+ outpath,
+ *args,
+ **kwargs):
+ super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs)
+
+ def namer(self,
+ n_clusters,
+ n_init,
+ max_iter):
+ return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
+
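+# Hand-run smoke test for the sweep; it assumes the similarity output at
+# `inpath` is available on the shared filesystem.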
+def test_select_kmeans_clustering():
+    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
+    outpath = "test_kmeans"
+    n_clusters = [200, 300, 400]
+    n_init = [1, 2, 3]
+    max_iter = [100000]
+
+    gs = kmeans_grid_sweep(inpath, outpath, n_clusters, n_init, max_iter)
+    gs.run(1)
+    gs.save("test_kmeans/sweep.csv")  # arbitrary output path for the collected results
+
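+# Entry point for a full sweep: each parameter is a list so a single call can
+# cover several configurations; `savefile` names the file the collected sweep
+# results are saved to.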
+def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]):
+ """Run kmeans clustering once or more with different parameters.