from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather
def test_select_hdbscan_clustering():
    select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
                              "test_hdbscan_author30k",
                              min_cluster_sizes=[2],
                              cluster_selection_epsilons=[0, 0.05, 0.1, 0.15],
                              cluster_selection_methods=['eom', 'leaf'])

    # a second, smaller sweep over the same similarities
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI"
    outpath = "test_hdbscan"
    min_cluster_sizes = [2, 3, 4]
    cluster_selection_epsilons = [0, 0.1, 0.3, 0.5]
    cluster_selection_methods = ['eom']
    select_hdbscan_clustering(inpath,
                              outpath,
                              min_cluster_sizes=min_cluster_sizes,
                              cluster_selection_epsilons=cluster_selection_epsilons,
                              cluster_selection_methods=cluster_selection_methods)

    # inspect the grid-search summary plus one run's cluster assignments and silhouette samples
    df = pd.read_csv("test_hdbscan/selection_data.csv")
    check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
    silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
    c = check_clusters.merge(silscores, on='subreddit')
@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str
    lsi_dimensions:int
    n_isolates:int
    silhouette_samples:str
def select_hdbscan_clustering(inpath,
                              outpath,
                              min_cluster_sizes=[2],
                              min_samples=[1],
                              cluster_selection_epsilons=[0],
                              cluster_selection_methods=['eom'],
                              lsi_dimensions='all'):

    inpath = Path(inpath)
    outpath = Path(outpath)
    outpath.mkdir(exist_ok=True, parents=True)

    if lsi_dimensions == 'all':
        lsi_paths = list(inpath.glob("*"))
    else:
        lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]

    lsi_nums = [p.stem for p in lsi_paths]

    # one grid entry per combination of LSI dimensionality and HDBSCAN parameters
    grid = list(product(lsi_nums,
                        min_cluster_sizes,
                        min_samples,
                        cluster_selection_epsilons,
                        cluster_selection_methods))

    # build the output file names from the grid parameters
    names = list(map(lambda t: '_'.join(map(str, t)), grid))
    grid = [(inpath/(str(t[0])+'.feather'), outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]

    with Pool(int(cpu_count()/4)) as pool:
        mods = pool.starmap(hdbscan_clustering, grid)

    res = pd.DataFrame(mods)
    outfile = outpath / "selection_data.csv"
    res.to_csv(outfile)
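# For illustration: a grid entry such as ('500', 2, 2, 0.1, 'eom'), i.e. (LSI dimensions,
# min_cluster_size, min_samples, cluster_selection_epsilon, cluster_selection_method),
# is named '500_2_2_0.1_eom'; its cluster assignments go to outpath/500_2_2_0.1_eom.feather
# and its per-subreddit silhouette scores to outpath/silhouette_samples500_2_2_0.1_eom.feather,
# matching the files read back in test_select_hdbscan_clustering above.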
def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    clustering = _hdbscan_clustering(mat,
                                     min_cluster_size=min_cluster_size,
                                     min_samples=min_samples,
                                     cluster_selection_epsilon=cluster_selection_epsilon,
                                     cluster_selection_method=cluster_selection_method,
                                     metric='precomputed',
                                     core_dist_n_jobs=cpu_count())

    cluster_data = process_clustering_result(clustering, subreddits)

    # isolates (label -1) are noise points; exclude them from the overall silhouette score
    isolates = clustering.labels_ == -1
    scoremat = mat[~isolates][:,~isolates]
    score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
    cluster_data.to_feather(output)

    # per-subreddit silhouette scores, computed on the full distance matrix
    silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
    silhouette_samp = pd.DataFrame({'subreddit':subreddits, 'score':silhouette_samp})
    silsampout = output.parent / ("silhouette_samples" + output.name)
    silhouette_samp.to_feather(silsampout)

    result = hdbscan_clustering_result(outpath=output,
                                       name=name,
                                       silhouette_samples=silsampout,
                                       silhouette_score=score,
                                       alt_silhouette_score=score,
                                       min_cluster_size=min_cluster_size,
                                       min_samples=min_samples,
                                       cluster_selection_epsilon=cluster_selection_epsilon,
                                       cluster_selection_method=cluster_selection_method,
                                       lsi_dimensions=lsi_dim,
                                       n_isolates=isolates.sum(),
                                       n_clusters=len(set(clustering.labels_)))

    return result
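# A minimal toy illustration (invented values, not data from this pipeline) of the
# isolate-masking step in hdbscan_clustering above: HDBSCAN labels noise points -1,
# and the silhouette score is computed only over the non-noise rows and columns of
# the precomputed distance matrix.
def example_isolate_masking():
    labels = np.array([0, -1, 1, 0])
    toy_dist = np.array([[0.0, 0.9, 0.8, 0.1],
                         [0.9, 0.0, 0.7, 0.9],
                         [0.8, 0.7, 0.0, 0.8],
                         [0.1, 0.9, 0.8, 0.0]])
    isolates = labels == -1
    submat = toy_dist[~isolates][:, ~isolates]  # 3x3 distance matrix of the clustered points
    return silhouette_score(submat, labels[~isolates], metric='precomputed')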
# for all runs we should try cluster_selection_epsilon = None
# for terms we should try cluster_selection_epsilon around 0.56-0.66
# for authors we should try cluster_selection_epsilon around 0.98-0.99
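# A hedged sketch, not part of the original pipeline: the cluster_selection_epsilon
# notes above can be turned into sweeps through select_hdbscan_clustering. The input
# and output paths and the exact epsilon values below are placeholders.
def example_epsilon_sweeps():
    # terms: epsilons in the 0.56-0.66 range suggested above
    select_hdbscan_clustering("path/to/term_similarities_LSI",    # hypothetical input directory
                              "terms_hdbscan_sweep",              # hypothetical output directory
                              cluster_selection_epsilons=[0.56, 0.6, 0.66],
                              cluster_selection_methods=['eom', 'leaf'])

    # authors: epsilons in the 0.98-0.99 range suggested above
    select_hdbscan_clustering("path/to/author_similarities_LSI",  # hypothetical input directory
                              "authors_hdbscan_sweep",            # hypothetical output directory
                              cluster_selection_epsilons=[0.98, 0.985, 0.99],
                              cluster_selection_methods=['eom', 'leaf'])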
def _hdbscan_clustering(mat, *args, **kwargs):
    print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
    clusterer = hdbscan.HDBSCAN(*args, **kwargs)
    clustering = clusterer.fit(mat.astype('double'))
    return clustering
def KNN_distances_plot(mat, outname, k=2):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:,-1]  # distance to each point's k-th nearest neighbor
    df = pd.DataFrame({'dist':d2})
    df = df.sort_values("dist", ascending=False)
    df['idx'] = np.arange(0, d2.shape[0]) + 1
    p = pn.qplot(x='idx', y='dist', data=df, geom='line') + pn.scales.scale_y_continuous(minor_breaks=np.arange(0,50)/50,
                                                                                         breaks=np.arange(0,10)/10)
    p.save(outname, width=16, height=10)
def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors-tf_knn_dist2.png')
if __name__ == "__main__":
    fire.Fire(select_hdbscan_clustering)
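# Example invocation via fire (illustrative script name, paths, and parameter values,
# not a tested command):
#   python hdbscan_clustering.py \
#       --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI \
#       --outpath=test_hdbscan_author30k \
#       --min_cluster_sizes='[2]' \
#       --cluster_selection_epsilons='[0,0.05,0.1,0.15]' \
#       --cluster_selection_methods="['eom','leaf']"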