from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
import pandas as pd
from itertools import product, starmap, chain
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather
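
# Grid sweeps of HDBSCAN hyperparameters (min_cluster_size, min_samples,
# cluster_selection_epsilon, cluster_selection_method) over precomputed subreddit
# similarity matrices, optionally across LSI dimensionalities, built on the
# grid_sweep / clustering_job infrastructure in clustering_base.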

def test_select_hdbscan_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_hdbscan"
    min_cluster_sizes = [2, 3, 4]
    min_samples = [1, 2]  # assumed example values; the sweep below requires this list
    cluster_selection_epsilons = [0, 0.1, 0.3, 0.5]
    cluster_selection_methods = ['eom']
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)  # execute the sweep before saving; the worker count here is an assumption
    gs.save("test_hdbscan/lsi_sweep.csv")
    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
    # print(job1.get_info())

# df = pd.read_csv("test_hdbscan/selection_data.csv")
# test_select_hdbscan_clustering()
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores, on='subreddit')
# fire.Fire(select_hdbscan_clustering)
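
# The classes below adapt the generic grid_sweep / lsi_grid_sweep machinery from
# clustering_base to HDBSCAN: the *_grid_sweep classes enumerate hyperparameter
# combinations and name their outputs, while hdbscan_job / hdbscan_lsi_job run a
# single clustering and report an hdbscan_clustering_result.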

class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self, inpath, lsi_dims, outpath,
                 min_cluster_sizes, min_samples,
                 cluster_selection_epsilons, cluster_selection_methods):

        super().__init__(hdbscan_lsi_job,
                         _hdbscan_lsi_grid_sweep,
                         inpath, lsi_dims, outpath,
                         min_cluster_sizes, min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)

class hdbscan_grid_sweep(grid_sweep):
    def __init__(self, inpath, outpath, *args, **kwargs):
        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self,
              min_cluster_size,
              min_samples,
              cluster_selection_epsilon,
              cluster_selection_method):
        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
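    # e.g. min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0.1,
    # cluster_selection_method='eom' yields the output name "mcs-2_ms-1_cse-0.1_csm-eom"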

class _hdbscan_lsi_grid_sweep(grid_sweep):
    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)

    def namer(self, *args, **kwargs):
        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s

@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str

@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass
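
# These result dataclasses extend clustering_result with the HDBSCAN hyperparameters,
# so each row of a sweep's saved output records the settings that produced it.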

class hdbscan_job(clustering_job):
    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
        super().__init__(infile,
                         outpath,
                         name,
                         call=hdbscan_job._hdbscan_clustering,
                         min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
                         cluster_selection_epsilon=cluster_selection_epsilon,
                         cluster_selection_method=cluster_selection_method
                         )

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.cluster_selection_method = cluster_selection_method
        # self.mat = 1 - self.mat

    def _hdbscan_clustering(mat, *args, **kwargs):
        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")

        # mat is a precomputed distance matrix; HDBSCAN expects float64 values here
        clusterer = hdbscan.HDBSCAN(*args,
                                    metric='precomputed',
                                    core_dist_n_jobs=cpu_count(),
                                    **kwargs)

        clustering = clusterer.fit(mat.astype('double'))

        return clustering

    def get_info(self):
        result = super().get_info()
        self.result = hdbscan_clustering_result(**result.__dict__,
                                                min_cluster_size=self.min_cluster_size,
                                                min_samples=self.min_samples,
                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                cluster_selection_method=self.cluster_selection_method)
        return self.result

class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(infile, outpath, name, *args, **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        partial_result = super().get_info()
        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
                                                    lsi_dimensions=self.lsi_dims)
        return self.result
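
# The commented-out block below appears to be an earlier, function-based implementation
# of the same hyperparameter sweep (select_hdbscan_clustering and its helpers).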

# def select_hdbscan_clustering(inpath,
#                               outpath,
#                               outfile=None,
#                               min_cluster_sizes=[2],
#                               min_samples=[1],
#                               cluster_selection_epsilons=[0],
#                               cluster_selection_methods=['eom'],
#                               lsi_dimensions='all'
#                               ):

#     inpath = Path(inpath)
#     outpath = Path(outpath)
#     outpath.mkdir(exist_ok=True, parents=True)

#     if lsi_dimensions is None:
#         lsi_paths = [inpath]
#     elif lsi_dimensions == 'all':
#         lsi_paths = list(inpath.glob("*"))
#     else:
#         lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]

#     if lsi_dimensions is not None:
#         lsi_nums = [p.stem for p in lsi_paths]
#     else:
#         lsi_nums = [None]

#     grid = list(product(lsi_nums,
#                         min_cluster_sizes,
#                         min_samples,
#                         cluster_selection_epsilons,
#                         cluster_selection_methods))

#     # fix the output file names
#     names = list(map(lambda t: '_'.join(map(str, t)), grid))

#     grid = [(inpath/(str(t[0])+'.feather'), outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]

#     with Pool(int(cpu_count()/4)) as pool:
#         mods = starmap(hdbscan_clustering, grid)

#     res = pd.DataFrame(mods)
#     if outfile is None:
#         outfile = outpath / "selection_data.csv"

#     res.to_csv(outfile)

# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
#     subreddits, mat = read_similarity_mat(similarities)
#     mat = sim_to_dist(mat)
#     clustering = _hdbscan_clustering(mat,
#                                      min_cluster_size=min_cluster_size,
#                                      min_samples=min_samples,
#                                      cluster_selection_epsilon=cluster_selection_epsilon,
#                                      cluster_selection_method=cluster_selection_method,
#                                      metric='precomputed',
#                                      core_dist_n_jobs=cpu_count()
#                                      )

#     cluster_data = process_clustering_result(clustering, subreddits)
#     isolates = clustering.labels_ == -1
#     scoremat = mat[~isolates][:, ~isolates]
#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
#     cluster_data.to_feather(output)
#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
#     silhouette_samp = pd.DataFrame({'subreddit': subreddits, 'score': silhouette_samp})
#     silsampout = output.parent / ("silhouette_samples" + output.name)
#     silhouette_samp.to_feather(silsampout)

#     result = hdbscan_clustering_result(outpath=output,
#                                        silhouette_samples=silsampout,
#                                        silhouette_score=score,
#                                        name=name,
#                                        min_cluster_size=min_cluster_size,
#                                        min_samples=min_samples,
#                                        cluster_selection_epsilon=cluster_selection_epsilon,
#                                        cluster_selection_method=cluster_selection_method,
#                                        lsi_dimensions=lsi_dim,
#                                        n_isolates=isolates.sum(),
#                                        n_clusters=len(set(clustering.labels_))
#                                        )

#     return(result)

# # for all runs we should try cluster_selection_epsilon = None
# # for terms we should try cluster_selection_epsilon around 0.56-0.66
# # for authors we should try cluster_selection_epsilon around 0.98-0.99
# def _hdbscan_clustering(mat, *args, **kwargs):
#     print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")

#     clusterer = hdbscan.HDBSCAN(*args,
#                                 **kwargs)

#     clustering = clusterer.fit(mat.astype('double'))

#     return(clustering)

def KNN_distances_plot(mat, outname, k=2):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:, -1]  # distance to the kth nearest neighbor
    df = pd.DataFrame({'dist': d2})
    df = df.sort_values("dist", ascending=False)
    df['idx'] = np.arange(0, d2.shape[0]) + 1
    p = pn.qplot(x='idx', y='dist', data=df, geom='line') + pn.scales.scale_y_continuous(minor_breaks=np.arange(0, 50)/50,
                                                                                         breaks=np.arange(0, 10)/10)
    p.save(outname, width=16, height=10)
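
# The sorted k-nearest-neighbor distance curve plotted above is a common heuristic for
# choosing a distance threshold such as cluster_selection_epsilon: look for the knee
# where the distances begin to rise sharply.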

def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
    fire.Fire({'grid_sweep': hdbscan_grid_sweep,
               'grid_sweep_lsi': hdbscan_lsi_grid_sweep,
               'cluster': hdbscan_job,
               'cluster_lsi': hdbscan_lsi_job})

# test_select_hdbscan_clustering()
# fire.Fire(select_hdbscan_clustering)
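
# fire exposes each dict entry above as a subcommand, with constructor arguments passed
# as flags; for example (script name and flag values are illustrative, not from the
# original source):
#   python hdbscan_clustering.py cluster --infile=<similarity.feather> --outpath=<outdir> --name=<name>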