X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/55b75ea6fcf421e95f4fe6b180dcec6e64676619..5a40465a629a1d7d95dbec9730d3950842bcb4f5:/clustering/umap_hdbscan_clustering.py?ds=sidebyside
diff --git a/clustering/umap_hdbscan_clustering.py b/clustering/umap_hdbscan_clustering.py
new file mode 100644
index 0000000..6a4d2a1
--- /dev/null
+++ b/clustering/umap_hdbscan_clustering.py
@@ -0,0 +1,221 @@
+from clustering_base import clustering_result, clustering_job, twoway_clustering_job, read_similarity_mat, sim_to_dist
+from hdbscan_clustering import hdbscan_clustering_result
+import umap
+from grid_sweep import twoway_grid_sweep
+from dataclasses import dataclass
+import hdbscan
+from sklearn.neighbors import NearestNeighbors
+import plotnine as pn
+import numpy as np
+import pandas as pd
+from multiprocessing import cpu_count
+import fire
+
+def test_select_hdbscan_clustering():
+    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
+    #                           "test_hdbscan_author30k",
+    #                           min_cluster_sizes=[2],
+    #                           min_samples=[1,2],
+    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
+    #                           cluster_selection_methods=['eom','leaf'],
+    #                           lsi_dimensions='all')
+    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI"
+    outpath = "test_umap_hdbscan_lsi"
+    min_cluster_sizes = [2,3,4]
+    min_samples = [1,2,3]
+    cluster_selection_epsilons = [0,0.1,0.3,0.5]
+    cluster_selection_methods = ['eom','leaf']
+    n_neighbors = [5,10,15,25,35,70,100]
+    learning_rate = [0.1,0.5,1,2]
+    # umap requires min_dist <= spread (spread defaults to 1.0), so values
+    # above 1 would raise a ValueError unless spread is raised as well.
+    min_dist = [0.1,0.25,0.5,1]
+    local_connectivity = [1,2,3,4,5]
+
+    # keys are singular so they match the keyword arguments of umap_hdbscan_job
+    # and umap_hdbscan_grid_sweep.namer below.
+    hdbscan_params = {"min_cluster_size":min_cluster_sizes, "min_samples":min_samples, "cluster_selection_epsilon":cluster_selection_epsilons, "cluster_selection_method":cluster_selection_methods}
+    umap_params = {"n_neighbors":n_neighbors, "learning_rate":learning_rate, "min_dist":min_dist, "local_connectivity":local_connectivity}
+    gs = umap_hdbscan_grid_sweep(inpath, outpath, umap_params, hdbscan_params)
+
+    # gs.run(20)
+    # gs.save("test_hdbscan/lsi_sweep.csv")
+
+    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom')
+    # job1.run()
+    # print(job1.get_info())
+
+    # df = pd.read_csv("test_hdbscan/selection_data.csv")
+    # test_select_hdbscan_clustering()
+    # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
+    # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
+    # c = check_clusters.merge(silscores, on='subreddit')
+    # fire.Fire(select_hdbscan_clustering)
+
+class umap_hdbscan_grid_sweep(twoway_grid_sweep):
+    def __init__(self,
+                 inpath,
+                 outpath,
+                 umap_params,
+                 hdbscan_params):
+
+        super().__init__(umap_hdbscan_job, inpath, outpath, self.namer, umap_params, hdbscan_params)
+
+    def namer(self,
+              min_cluster_size,
+              min_samples,
+              cluster_selection_epsilon,
+              cluster_selection_method,
+              n_neighbors,
+              learning_rate,
+              min_dist,
+              local_connectivity
+              ):
+        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}"
+
+@dataclass
+class umap_hdbscan_clustering_result(hdbscan_clustering_result):
+    n_neighbors:int
+    learning_rate:float
+    min_dist:float
+    local_connectivity:int
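+
+# Illustrative only (not called by the pipeline): the namer above maps one
+# parameter combination to one filename stem, e.g.
+#
+#   namer(min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0,
+#         cluster_selection_method='eom', n_neighbors=15, learning_rate=1,
+#         min_dist=1, local_connectivity=1)
+#   returns 'mcs-2_ms-1_cse-0_csm-eom_nn-15_lr-1_md-1_lc-1'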
+
+class umap_hdbscan_job(twoway_clustering_job):
+    def __init__(self, infile,
+                 outpath, name,
+                 umap_args = {"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1},
+                 hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'},
+                 save_step1 = False,
+                 *args,
+                 **kwargs):
+        super().__init__(infile,
+                         outpath,
+                         name,
+                         call1=umap_hdbscan_job._umap_embedding,
+                         call2=umap_hdbscan_job._hdbscan_clustering,
+                         args1=umap_args,
+                         args2=hdbscan_args,
+                         save_step1=save_step1,
+                         *args,
+                         **kwargs
+                         )
+
+        self.n_neighbors = umap_args['n_neighbors']
+        self.learning_rate = umap_args['learning_rate']
+        self.min_dist = umap_args['min_dist']
+        self.local_connectivity = umap_args['local_connectivity']
+        self.min_cluster_size = hdbscan_args['min_cluster_size']
+        self.min_samples = hdbscan_args['min_samples']
+        self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon']
+        self.cluster_selection_method = hdbscan_args['cluster_selection_method']
+
+    def after_run(self):
+        # attach the 2d umap coordinates to the cluster data for plotting
+        coords = self.step1.embedding_
+        self.cluster_data['x'] = coords[:,0]
+        self.cluster_data['y'] = coords[:,1]
+        super().after_run()
+
+    @staticmethod
+    def _umap_embedding(mat, **umap_args):
+        print(f"running umap embedding. umap_args:{umap_args}")
+        umapmodel = umap.UMAP(metric='precomputed', **umap_args)
+        umapmodel = umapmodel.fit(mat)
+        return umapmodel
+
+    @staticmethod
+    def _hdbscan_clustering(mat, umapmodel, **hdbscan_args):
+        print(f"running hdbscan clustering. hdbscan_args:{hdbscan_args}")
+
+        umap_coords = umapmodel.transform(mat)
+
+        clusterer = hdbscan.HDBSCAN(metric='euclidean',
+                                    core_dist_n_jobs=cpu_count(),
+                                    **hdbscan_args
+                                    )
+
+        clustering = clusterer.fit(umap_coords)
+
+        return clustering
+
+    def get_info(self):
+        result = super().get_info()
+        self.result = umap_hdbscan_clustering_result(**result.__dict__,
+                                                     min_cluster_size=self.min_cluster_size,
+                                                     min_samples=self.min_samples,
+                                                     cluster_selection_epsilon=self.cluster_selection_epsilon,
+                                                     cluster_selection_method=self.cluster_selection_method,
+                                                     n_neighbors=self.n_neighbors,
+                                                     learning_rate=self.learning_rate,
+                                                     min_dist=self.min_dist,
+                                                     local_connectivity=self.local_connectivity
+                                                     )
+        return self.result
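+
+# A minimal sketch of the two-step flow the job wraps (hypothetical input path;
+# read_similarity_mat and sim_to_dist are imported from clustering_base):
+#
+#   subreddits, mat = read_similarity_mat("similarities.feather")
+#   dist = sim_to_dist(mat)
+#   model = umap_hdbscan_job._umap_embedding(dist, n_neighbors=15, learning_rate=1,
+#                                            min_dist=1, local_connectivity=1)
+#   clustering = umap_hdbscan_job._hdbscan_clustering(dist, model, min_cluster_size=2,
+#                                                     min_samples=1, cluster_selection_epsilon=0,
+#                                                     cluster_selection_method='eom')
+#   labels = clustering.labels_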
+ """ + + umap_args = {'n_neighbors':list(map(int, n_neighbors)), + 'learning_rate':list(map(float,learning_rate)), + 'min_dist':list(map(float,min_dist)), + 'local_connectivity':list(map(int,local_connectivity)), + } + + hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), + 'min_samples':list(map(int,min_samples)), + 'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)), + 'cluster_selection_method':cluster_selection_methods} + + obj = umap_hdbscan_grid_sweep(inpath, + outpath, + umap_args, + hdbscan_args) + obj.run(cores=10) + obj.save(savefile) + + +def KNN_distances_plot(mat,outname,k=2): + nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) + distances, indices = nbrs.kneighbors(mat) + d2 = distances[:,-1] + df = pd.DataFrame({'dist':d2}) + df = df.sort_values("dist",ascending=False) + df['idx'] = np.arange(0,d2.shape[0]) + 1 + p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50, + breaks = np.arange(0,10)/10) + p.save(outname,width=16,height=10) + +def make_KNN_plots(): + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + + KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png') + + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png') + + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') + +if __name__ == "__main__": + fire.Fire(run_umap_hdbscan_grid_sweep) + +# test_select_hdbscan_clustering() + #fire.Fire(select_hdbscan_clustering)