]> code.communitydata.science - cdsc_reddit.git/blob - clustering/affinity_clustering_lsi.py
Merge branch 'master' of code:cdsc_reddit into excise_reindex
[cdsc_reddit.git] / clustering / affinity_clustering_lsi.py
1 import fire
2 from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
3 from grid_sweep import grid_sweep
4 from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
5 from dataclasses import dataclass
6
@dataclass
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
    """Affinity clustering result combined with the fields of ``lsi_result_mixin``.

    Declares no fields of its own; everything comes from the two bases.
    """
10
11
class affinity_lsi_job(affinity_job, lsi_mixin):
    """Affinity clustering job that additionally carries an LSI dimension setting."""

    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        """Initialise the parent job, then register ``lsi_dims`` on it.

        ``lsi_dims`` is consumed here; every other argument is forwarded
        unchanged to the parent initialiser.
        """
        super().__init__(infile, outpath, name, *args, **kwargs)
        # Presumably stores the value later read back as ``self.lsi_dims``
        # in get_info() -- confirm against lsi_mixin.
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the parent's result rebuilt as ``affinity_clustering_result_lsi``.

        All attributes of the parent's result are copied over and
        ``lsi_dimensions`` is filled in from ``self.lsi_dims``. The rebuilt
        result is also stored on ``self.result``.
        """
        base_fields = vars(super().get_info())
        lsi_result = affinity_clustering_result_lsi(**base_fields,
                                                    lsi_dimensions=self.lsi_dims)
        self.result = lsi_result
        return lsi_result
26     
class affinity_lsi_grid_sweep(lsi_grid_sweep):
    """LSI grid sweep wired up for affinity clustering jobs."""

    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 dampings=[0.9],
                 max_iters=[10000],
                 convergence_iters=[30],
                 preference_quantiles=[0.5]):
        """Forward everything to ``lsi_grid_sweep``.

        ``affinity_lsi_job`` is passed as the job type and
        ``_affinity_lsi_grid_sweep`` as the sweep class; the four parameter
        collections follow positionally, in signature order.
        """
        # The default lists are only forwarded here, never mutated.
        affinity_params = (dampings,
                           max_iters,
                           convergence_iters,
                           preference_quantiles)
        super().__init__(affinity_lsi_job,
                         _affinity_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         *affinity_params)
46     
47
class _affinity_lsi_grid_sweep(grid_sweep):
    """Grid sweep of ``affinity_lsi_job`` runs for one fixed LSI dimension."""

    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        """Record the LSI dimension and delegate to ``grid_sweep``.

        The LSI dimension is handed to the parent as a one-element list ahead
        of the remaining positional arguments.
        """
        # Set attributes before delegating: the parent receives self.namer,
        # which reads self.lsi_dim (it may be called during init -- unconfirmed).
        self.lsi_dim = lsi_dim
        self.jobtype = affinity_lsi_job
        fixed_lsi_axis = [lsi_dim]
        super().__init__(affinity_lsi_job,
                         inpath,
                         outpath,
                         self.namer,
                         fixed_lsi_axis,
                         *args,
                         **kwargs)

    def namer(self, *args, **kwargs):
        """Build a job name: the affinity sweep's name plus an ``_lsi-<dim>`` suffix."""
        # The first positional argument is dropped before delegating;
        # presumably it is the LSI dimension from the extra axis above -- TODO confirm.
        affinity_args = args[1:]
        base_name = affinity_grid_sweep.namer(self, *affinity_args, **kwargs)
        return base_name + f"_lsi-{self.lsi_dim}"
69                          
def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all',n_cores=30):
    """Run affinity clustering once or more with different parameters.

    Usage:
    affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --convergence_iters=<csv> --preference_quantiles=<csv> --lsi_dimensions=<"all" or csv> --n_cores=N

    Keyword arguments:
    savefile: path to save the metadata and diagnostics.
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit affinity propagation clusterings.
    dampings: one or more numbers in [0.5, 1). Damping parameter in affinity propagation clustering.
    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters: one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more integers giving different maximum numbers of iterations.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    n_cores: value handed to the sweep's run(); presumably the number of parallel workers.

    Each of dampings, max_iters, convergence_iters and preference_quantiles
    may be a single value or any iterable of values.
    """

    def _as_list(values, cast):
        """Return ``values`` as a concrete list with ``cast`` applied to each item.

        A lone scalar or string is treated as a one-element collection; the
        command line yields a bare number when only one value is supplied.
        """
        if isinstance(values, (str, bytes)):
            return [cast(values)]
        try:
            items = iter(values)
        except TypeError:
            # Not iterable: a single scalar such as 0.8 or 3000.
            items = [values]
        return [cast(item) for item in items]

    # Pass concrete lists rather than one-shot ``map`` iterators: a list can
    # be traversed repeatedly, whereas an exhausted iterator comes back empty
    # on any second pass over the parameter grid.
    obj = affinity_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
                                  _as_list(dampings, float),
                                  _as_list(max_iters, int),
                                  _as_list(convergence_iters, int),
                                  _as_list(preference_quantiles, float))

    obj.run(n_cores)
    obj.save(savefile)
97
# Script entry point: expose run_affinity_lsi_grid_sweep as a command-line
# interface through fire.
if __name__ == "__main__":
    fire.Fire(run_affinity_lsi_grid_sweep)

Community Data Science Collective || Want to submit a patch?