]> code.communitydata.science - cdsc_reddit.git/blob - clustering/affinity_clustering.py
Merge remote-tracking branch 'origin/excise_reindex' into temp
[cdsc_reddit.git] / clustering / affinity_clustering.py
1 from sklearn.metrics import silhouette_score
2 from sklearn.cluster import AffinityPropagation
3 from functools import partial
4 from dataclasses import dataclass
5 from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
6 from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
7 from multiprocessing  import Pool, cpu_count, Array, Process
8 from pathlib import Path
9 from itertools import product, starmap
10 import numpy as np
11 import pandas as pd
12 import fire
13 import sys
14
# The silhouette score is the only clustering metric here that doesn't need the feature matrix, so it's probably the only one worth trying.
@dataclass
class affinity_clustering_result(clustering_result):
    """Result record for one affinity propagation run.

    Extends ``clustering_result`` with the hyperparameters that
    ``affinity_job.get_info`` records alongside the base result fields.
    """

    # Damping value the job was configured with.
    damping: float
    # Convergence-iteration setting the job was configured with.
    convergence_iter: int
    # Quantile of the input matrix used to derive ``preference``.
    preference_quantile: float
    # Preference value actually passed to AffinityPropagation.
    preference: float
    # Iteration cap the job was configured with.
    max_iter: int
23
@dataclass
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
    """Affinity propagation result combined with the fields of ``lsi_result_mixin``.

    Declares no fields of its own; everything comes from the two bases.
    """
27
class affinity_job(clustering_job):
    """Clustering job that fits scikit-learn's ``AffinityPropagation``.

    The hyperparameters are forwarded to ``clustering_job`` as keyword
    arguments together with ``call=self._affinity_clustering``; they are also
    kept on the instance so ``get_info`` can report them.
    """

    def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
        """Configure the job.

        Args:
            infile: passed through to ``clustering_job``.
            outpath: passed through to ``clustering_job``.
            name: passed through to ``clustering_job``.
            damping: forwarded to ``AffinityPropagation``.
            max_iter: forwarded to ``AffinityPropagation``.
            convergence_iter: forwarded to ``AffinityPropagation``.
            preference_quantile: quantile of the transformed matrix used as
                the ``preference`` value.
            random_state: seed forwarded to ``AffinityPropagation``.
            verbose: forwarded to ``AffinityPropagation``.
        """
        super().__init__(infile,
                         outpath,
                         name,
                         call=self._affinity_clustering,
                         preference_quantile=preference_quantile,
                         damping=damping,
                         max_iter=max_iter,
                         convergence_iter=convergence_iter,
                         # Forward the caller's seed. This used to be the
                         # literal 1968, which silently ignored the argument.
                         random_state=random_state,
                         verbose=verbose)
        self.damping = damping
        self.max_iter = max_iter
        self.convergence_iter = convergence_iter
        self.preference_quantile = preference_quantile

    def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs):
        """Fit ``AffinityPropagation`` on ``1 - mat`` and return the fitted estimator.

        The preference is the ``preference_quantile`` quantile of the
        transformed matrix; it is stored on ``self.preference`` so that
        ``get_info`` can report it. Remaining positional and keyword arguments
        are forwarded to the ``AffinityPropagation`` constructor.
        """
        # NOTE(review): presumably the base job hands over a distance matrix,
        # so 1 - mat turns it back into the similarities that
        # affinity='precomputed' expects -- confirm against clustering_job.
        mat = 1 - mat
        preference = np.quantile(mat, preference_quantile)
        self.preference = preference
        print(f"preference is {preference}")
        print("data loaded")
        # Flush so progress shows up promptly when stdout is buffered.
        sys.stdout.flush()
        clustering = AffinityPropagation(*args,
                                         preference=preference,
                                         affinity='precomputed',
                                         copy=False,
                                         **kwargs).fit(mat)
        return clustering

    def get_info(self):
        """Return the base result extended with this job's hyperparameters.

        The combined ``affinity_clustering_result`` is also stored on
        ``self.result``. ``self.preference`` is only assigned inside
        ``_affinity_clustering``, so that method must have run first.
        """
        result = super().get_info()
        self.result = affinity_clustering_result(**result.__dict__,
                                                 damping=self.damping,
                                                 max_iter=self.max_iter,
                                                 convergence_iter=self.convergence_iter,
                                                 preference_quantile=self.preference_quantile,
                                                 preference=self.preference)

        return self.result
69
class affinity_lsi_job(affinity_job, lsi_mixin):
    """Affinity propagation job that additionally carries an LSI dimension setting."""

    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        """Set up the underlying affinity job, then register ``lsi_dims``.

        Extra positional and keyword arguments are forwarded unchanged to
        ``affinity_job.__init__``.
        """
        super().__init__(infile, outpath, name, *args, **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the affinity result extended with ``lsi_dimensions``.

        The combined record is also stored on ``self.result``.
        """
        base_info = super().get_info()
        # self.lsi_dims is presumably assigned by set_lsi_dims -- verify in lsi_mixin.
        self.result = affinity_clustering_result_lsi(
            lsi_dimensions=self.lsi_dims,
            **base_info.__dict__,
        )
        return self.result
84
class affinity_grid_sweep(grid_sweep):
    """Grid sweep over ``affinity_job`` hyperparameters."""

    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):
        """Build a sweep of ``affinity_job`` runs.

        Args:
            inpath: forwarded to ``grid_sweep``.
            outpath: forwarded to ``grid_sweep``.
            *args, **kwargs: forwarded to ``grid_sweep`` after the namer.
        """
        # The call used to pass an extra, undefined name
        # (`_afffinity_grid_sweep`) as the second argument, which raised
        # NameError on instantiation. It is dropped so the argument order
        # (jobtype, inpath, outpath, namer, ...) matches the other direct
        # grid_sweep subclass in this module.
        # NOTE(review): confirm against grid_sweep.__init__'s signature.
        super().__init__(affinity_job,
                         inpath,
                         outpath,
                         self.namer,
                         *args,
                         **kwargs)

    def namer(self,
              damping,
              max_iter,
              convergence_iter,
              preference_quantile):
        """Return the run name encoding one hyperparameter combination."""
        return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
106
class _affinity_lsi_grid_sweep(grid_sweep):
    """Grid sweep of ``affinity_lsi_job`` runs for a single ``lsi_dim`` value."""

    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        """Record ``lsi_dim`` and the job type, then delegate to ``grid_sweep``.

        Both attributes are assigned before the base initialiser runs because
        ``self.namer``, which reads ``self.lsi_dim``, is handed to it.
        """
        self.lsi_dim = lsi_dim
        self.jobtype = affinity_lsi_job
        super().__init__(
            self.jobtype,
            inpath,
            outpath,
            self.namer,
            self.lsi_dim,
            *args,
            **kwargs,
        )

    def namer(self, *args, **kwargs):
        """Return the affinity run name with an ``_lsi-<dim>`` suffix.

        The first positional argument is skipped; the rest go to
        ``affinity_grid_sweep.namer``.
        """
        base_name = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
        return f"{base_name}_lsi-{self.lsi_dim}"
128
class affinity_lsi_grid_sweep(lsi_grid_sweep):
    """LSI-aware grid sweep over affinity propagation hyperparameters."""

    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 dampings=None,
                 max_iters=None,
                 convergence_iters=None,
                 preference_quantiles=None):
        """Delegate to ``lsi_grid_sweep`` with the affinity job and sweep classes.

        Args:
            inpath: forwarded to ``lsi_grid_sweep``.
            lsi_dims: forwarded to ``lsi_grid_sweep``.
            outpath: forwarded to ``lsi_grid_sweep``.
            dampings: damping values to sweep; defaults to ``[0.9]``.
            max_iters: iteration caps to sweep; defaults to ``[10000]``.
            convergence_iters: convergence settings to sweep; defaults to ``[30]``.
            preference_quantiles: preference quantiles to sweep; defaults to ``[0.5]``.
        """
        # None sentinels replace the former list defaults so that each call
        # gets fresh lists instead of sharing one mutable object across calls.
        if dampings is None:
            dampings = [0.9]
        if max_iters is None:
            max_iters = [10000]
        if convergence_iters is None:
            convergence_iters = [30]
        if preference_quantiles is None:
            preference_quantiles = [0.5]

        super().__init__(affinity_lsi_job,
                         _affinity_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         dampings,
                         max_iters,
                         convergence_iters,
                         preference_quantiles)
148     
149                          
150     
def test_select_affinity_clustering():
    """Run a small affinity LSI grid sweep against a fixed dataset and save it.

    Relies on a hard-coded input directory, so it only works where that path
    exists.
    """
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_affinity"

    # Positional order: dampings, max_iters, convergence_iters, preference_quantiles.
    sweep_params = (
        [0.8, 0.9],
        [100000],
        [15],
        [0.5, 0.7],
    )

    sweep = affinity_lsi_grid_sweep(inpath, 'all', outpath, *sweep_params)
    # NOTE(review): 20 is presumably the worker/core count -- confirm in lsi_grid_sweep.run.
    sweep.run(20)
    sweep.save("test_affinity/lsi_sweep.csv")
169
170
if __name__ == "__main__":
    # Expose the sweep and job classes as CLI sub-commands. The original
    # `fire.Fire{...}` was a syntax error: it lacked the call parentheses and
    # a comma after the 'grid_sweep_lsi' entry.
    fire.Fire({'grid_sweep': affinity_grid_sweep,
               'grid_sweep_lsi': affinity_lsi_grid_sweep,
               'cluster': affinity_job,
               'cluster_lsi': affinity_lsi_job})

Community Data Science Collective || Want to submit a patch?