1 from sklearn.metrics import silhouette_score
2 from sklearn.cluster import AffinityPropagation
3 from functools import partial
4 from dataclasses import dataclass
5 from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
6 from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
7 from multiprocessing import Pool, cpu_count, Array, Process
8 from pathlib import Path
9 from itertools import product, starmap
15 # silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
class affinity_clustering_result(clustering_result):
    # Result record for an affinity-propagation run.
    # NOTE(review): other fields (damping, max_iter, convergence_iter,
    # preference — see get_info below) appear to live on lines not visible
    # in this view; confirm against the full file.
    # Quantile of the similarity matrix used to derive the AffinityPropagation
    # `preference` parameter (see affinity_job._affinity_clustering).
    preference_quantile:float
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
    """Affinity clustering result for an LSI-reduced matrix; adds the
    lsi_result_mixin fields (e.g. lsi_dimensions) to affinity_clustering_result."""
class affinity_job(clustering_job):
    """Clustering job that runs sklearn AffinityPropagation on a precomputed
    similarity matrix loaded from `infile`."""
    def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
        # Hand the actual clustering callable and its hyperparameters to the
        # base class. NOTE(review): several arguments of this super() call
        # (outpath, name, damping, max_iter, random_state, verbose) fall on
        # lines not visible in this view — confirm against the full file.
        super().__init__(infile,
                         call=self._affinity_clustering,
                         preference_quantile=preference_quantile,
                         convergence_iter=convergence_iter,
        # Keep hyperparameters on the instance so get_info() can report them.
        self.max_iter=max_iter
        self.convergence_iter=convergence_iter
        self.preference_quantile=preference_quantile
    def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs):
        # AffinityPropagation's `preference` controls how readily points become
        # exemplars (and hence how many clusters form); derive it from the
        # requested quantile of the similarity matrix.
        preference = np.quantile(mat, preference_quantile)
        self.preference = preference
        print(f"preference is {preference}")
        # NOTE(review): the remaining AffinityPropagation arguments (damping,
        # max_iter, etc.) and the fit/return are on lines not visible here.
        clustering = AffinityPropagation(*args,
                                         preference=preference,
                                         affinity='precomputed',
        # Extend the base class's result record with this job's
        # affinity-specific hyperparameters and the computed preference.
        # NOTE(review): the enclosing `def get_info(self):` line and the
        # `return self.result` are on lines not visible in this view.
        result = super().get_info()
        self.result=affinity_clustering_result(**result.__dict__,
                                               max_iter=self.max_iter,
                                               convergence_iter=self.convergence_iter,
                                               preference_quantile=self.preference_quantile,
                                               preference=self.preference)
class affinity_lsi_job(affinity_job, lsi_mixin):
    # Affinity job whose input matrix comes from an LSI-reduced similarity;
    # records the LSI dimensionality in its result.
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        # NOTE(review): the remaining arguments of this super() call are on
        # lines not visible in this view.
        super().__init__(infile,
        super().set_lsi_dims(lsi_dims)

        # NOTE(review): the `def get_info(self):` header for the statements
        # below is not visible in this view.
        result = super().get_info()
        self.result = affinity_clustering_result_lsi(**result.__dict__,
                                                     lsi_dimensions=self.lsi_dims)
class affinity_grid_sweep(grid_sweep):
    # Sweeps affinity_job over combinations of damping, max_iter,
    # convergence_iter and preference_quantile.
    # NOTE(review): the __init__ signature and namer signature are on lines
    # not visible in this view.
        super().__init__(affinity_job,
                         # NOTE(review): `_afffinity_grid_sweep` (three f's)
                         # looks like a typo; no definition with that spelling
                         # is visible here — confirm it exists elsewhere.
                         _afffinity_grid_sweep,
                         preference_quantile):

        # Encode the hyperparameter combination into a unique job/file name.
        return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
class _affinity_lsi_grid_sweep(grid_sweep):
    # Inner sweep for a single LSI dimensionality; jobs are affinity_lsi_job.
    # NOTE(review): the __init__ signature and the rest of this super() call
    # are on lines not visible in this view.
        self.lsi_dim = lsi_dim
        self.jobtype = affinity_lsi_job
        super().__init__(self.jobtype,
    def namer(self, *args, **kwargs):
        # Reuse the non-LSI namer (dropping the leading lsi-related argument),
        # then tag the resulting name with this sweep's LSI dimensionality.
        # NOTE(review): the `return s` presumably follows on a line not
        # visible in this view.
        s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
class affinity_lsi_grid_sweep(lsi_grid_sweep):
    # Top-level sweep: crosses LSI dimensionalities with affinity
    # hyperparameter grids via _affinity_lsi_grid_sweep.
    # NOTE(review): the start of the __init__ signature (inpath, lsi_dims,
    # outpath, dampings, max_iters) is on lines not visible in this view.
    # NOTE(review): mutable list defaults are shared across calls; harmless
    # only if the sweep never mutates them — confirm.
                 convergence_iters=[30],
                 preference_quantiles=[0.5]):

        super().__init__(affinity_lsi_job,
                         _affinity_lsi_grid_sweep,
                         preference_quantiles)
def test_select_affinity_clustering():
    # Smoke test: run a small LSI affinity sweep on the 10k-subreddit
    # similarity data and save the sweep results to CSV.
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                          "test_hdbscan_author30k",
    #                          min_cluster_sizes=[2],
    #                          cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                          cluster_selection_methods=['eom','leaf'],
    #                          lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_affinity";
    convergence_iters=[15]
    preference_quantiles=[0.5,0.7]
    # NOTE(review): the `dampings` and `max_iters` assignments used below are
    # on lines not visible in this view.
    gs = affinity_lsi_grid_sweep(inpath, 'all', outpath, dampings, max_iters, convergence_iters, preference_quantiles)
    gs.save("test_affinity/lsi_sweep.csv")
if __name__ == "__main__":
    # Expose the sweep/cluster entry points as CLI subcommands via python-fire,
    # e.g. `python <this file> grid_sweep_lsi <inpath> <lsi_dims> <outpath> ...`.
    # Fixes: fire.Fire is a function call (parentheses, not braces), and the
    # dict literal was missing the comma after the 'grid_sweep_lsi' entry.
    fire.Fire({'grid_sweep': affinity_grid_sweep,
               'grid_sweep_lsi': affinity_lsi_grid_sweep,
               'cluster': affinity_job,
               'cluster_lsi': affinity_lsi_job})