diff --git a/clustering/kmeans_clustering.py b/clustering/kmeans_clustering.py
index 8822e9f..211b666 100644
--- a/clustering/kmeans_clustering.py
+++ b/clustering/kmeans_clustering.py
@@ -1,101 +1,99 @@
 from sklearn.cluster import KMeans
 import fire
 from pathlib import Path
-from multiprocessing import cpu_count
 from dataclasses import dataclass
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
 
 @dataclass
 class kmeans_clustering_result(clustering_result):
     n_clusters:int
     n_init:int
-
-def kmeans_clustering(similarities, *args, **kwargs):
-    subreddits, mat = read_similarity_mat(similarities)
-    mat = sim_to_dist(mat)
-    clustering = _kmeans_clustering(mat, *args, **kwargs)
-    cluster_data = process_clustering_result(clustering, subreddits)
-    return(cluster_data)
-
-def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
-
-    clustering = KMeans(n_clusters=n_clusters,
-                        n_init=n_init,
-                        max_iter=max_iter,
-                        random_state=random_state,
-                        verbose=verbose
-                        ).fit(mat)
-
-    return clustering
-
-def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
-    if name is None:
-        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
-    print(name)
-    sys.stdout.flush()
-    outpath = outdir / (str(name) + ".feather")
-    print(outpath)
-    mat = sim_to_dist(mat)
-    clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose)
-
-    outpath.parent.mkdir(parents=True,exist_ok=True)
-    cluster_data.to_feather(outpath)
-    cluster_data = process_clustering_result(clustering, subreddits)
-
-    try:
-        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
-    except ValueError:
-        score = None
-
-    if alt_mat is not None:
-        alt_distances = sim_to_dist(alt_mat)
-        try:
-            alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
-        except ValueError:
-            alt_score = None
+    max_iter:int
+
+class kmeans_job(clustering_job):
+    def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
+        super().__init__(infile,
+                         outpath,
+                         name,
+                         call=kmeans_job._kmeans_clustering,
+                         n_clusters=n_clusters,
+                         n_init=n_init,
+                         max_iter=max_iter,
+                         random_state=random_state,
+                         verbose=verbose)
+
+        self.n_clusters=n_clusters
+        self.n_init=n_init
+        self.max_iter=max_iter
+
+    @staticmethod
+    def _kmeans_clustering(mat, *args, **kwargs):
+        # Fit sklearn KMeans; remaining positional and keyword arguments pass through to the constructor.
+        clustering = KMeans(*args,
+                            **kwargs,
+                            ).fit(mat)
+
+        return clustering
+
+
+    def get_info(self):
+        result = super().get_info()
+        self.result = kmeans_clustering_result(**result.__dict__,
+                                               n_init=self.n_init,
+                                               max_iter=self.max_iter)
+        return self.result
+
+
+class kmeans_grid_sweep(grid_sweep):
+
+    def __init__(self,
+                 inpath,
+                 outpath,
+                 *args,
+                 **kwargs):
+        super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs)
+
+    def namer(self,
+              n_clusters,
+              n_init,
+              max_iter):
+        return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
+
+def test_select_kmeans_clustering():
+    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
"/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" + outpath = "test_kmeans"; + n_clusters=[200,300,400]; + n_init=[1,2,3]; + max_iter=[100000] + + gs = kmeans_lsi_grid_sweep(inpath, 'all', outpath, n_clusters, n_init, max_iter) + gs.run(1) + + cluster_selection_epsilons=[0,0.1,0.3,0.5]; + cluster_selection_methods=['eom']; + lsi_dimensions='all' + gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods) + gs.run(20) + gs.save("test_hdbscan/lsi_sweep.csv") + +def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]): + """Run kmeans clustering once or more with different parameters. - res = kmeans_clustering_result(outpath=outpath, - max_iter=max_iter, - n_clusters=n_clusters, - n_init = n_init, - silhouette_score=score, - alt_silhouette_score=score, - name=str(name)) - - return res - - -# alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). -def select_kmeans_clustering(similarities, outdir, outinfo, n_clusters=[1000], max_iter=100000, n_init=10, random_state=1968, verbose=True, alt_similarities=None): - - n_clusters = list(map(int,n_clusters)) - n_init = list(map(int,n_init)) + Usage: + kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters= --n_inits= --max_iters= - if type(outdir) is str: - outdir = Path(outdir) + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. + n_clusters: one or more numbers of kmeans clusters to select. + n_inits: one or more numbers of different initializations to use for each clustering. + max_iters: one or more numbers of different maximum interations. + """ - outdir.mkdir(parents=True,exist_ok=True) + obj = kmeans_grid_sweep(inpath, + outpath, + map(int,n_clusters), + map(int,n_inits), + map(int,max_iters)) - subreddits, mat = read_similarity_mat(similarities,use_threads=True) - if alt_similarities is not None: - alt_mat = read_similarity_mat(alt_similarities,use_threads=True) - else: - alt_mat = None - - # get list of tuples: the combinations of hyperparameters - hyper_grid = product(n_clusters, n_init) - hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid)) - - _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) - - # call starmap - print("running clustering selection") - clustering_data = starmap(_do_clustering, hyper_grid) - clustering_data = pd.DataFrame(list(clustering_data)) - clustering_data.to_csv(outinfo) - - return clustering_data + obj.run(1) + obj.save(savefile) if __name__ == "__main__": - x = fire.Fire(select_kmeans_clustering) + fire.Fire(run_kmeans_grid_sweep)