X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/47ba04aa9715325a67fe17cee205230b042022fe..8a2248fae1ee5818576b9a8f2849d1ad0efd8187:/clustering/affinity_clustering.py?ds=inline diff --git a/clustering/affinity_clustering.py b/clustering/affinity_clustering.py new file mode 100644 index 0000000..b4f8461 --- /dev/null +++ b/clustering/affinity_clustering.py @@ -0,0 +1,175 @@ +from sklearn.metrics import silhouette_score +from sklearn.cluster import AffinityPropagation +from functools import partial +from dataclasses import dataclass +from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat +from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep +from multiprocessing import Pool, cpu_count, Array, Process +from pathlib import Path +from itertools import product, starmap +import numpy as np +import pandas as pd +import fire +import sys + +# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. +@dataclass +class affinity_clustering_result(clustering_result): + damping:float + convergence_iter:int + preference_quantile:float + preference:float + max_iter:int + +@dataclass +class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin): + pass + +class affinity_job(clustering_job): + def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): + super().__init__(infile, + outpath, + name, + call=self._affinity_clustering, + preference_quantile=preference_quantile, + damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + random_state=1968, + verbose=verbose) + self.damping=damping + self.max_iter=max_iter + self.convergence_iter=convergence_iter + self.preference_quantile=preference_quantile + + def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs): + mat = 1-mat + preference = np.quantile(mat, preference_quantile) + self.preference = preference + print(f"preference is {preference}") + print("data loaded") + sys.stdout.flush() + clustering = AffinityPropagation(*args, + preference=preference, + affinity='precomputed', + copy=False, + **kwargs).fit(mat) + return clustering + + def get_info(self): + result = super().get_info() + self.result=affinity_clustering_result(**result.__dict__, + damping=self.damping, + max_iter=self.max_iter, + convergence_iter=self.convergence_iter, + preference_quantile=self.preference_quantile, + preference=self.preference) + + return self.result + +class affinity_lsi_job(affinity_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__(infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) + + def get_info(self): + result = super().get_info() + self.result = affinity_clustering_result_lsi(**result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result + +class affinity_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + *args, + **kwargs): + + super().__init__(affinity_job, + _afffinity_grid_sweep, + inpath, + outpath, + self.namer, + *args, + **kwargs) + def namer(self, + damping, + max_iter, + convergence_iter, + preference_quantile): + + return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}" + +class _affinity_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + self.lsi_dim = lsi_dim + self.jobtype = affinity_lsi_job + super().__init__(self.jobtype, + inpath, + outpath, + self.namer, + self.lsi_dim, + *args, + **kwargs) + + def namer(self, *args, **kwargs): + s = affinity_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s + +class affinity_lsi_grid_sweep(lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + dampings=[0.9], + max_iters=[10000], + convergence_iters=[30], + preference_quantiles=[0.5]): + + super().__init__(affinity_lsi_job, + _affinity_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + dampings, + max_iters, + convergence_iters, + preference_quantiles) + + + +def test_select_affinity_clustering(): + # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", + # "test_hdbscan_author30k", + # min_cluster_sizes=[2], + # min_samples=[1,2], + # cluster_selection_epsilons=[0,0.05,0.1,0.15], + # cluster_selection_methods=['eom','leaf'], + # lsi_dimensions='all') + inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" + outpath = "test_affinity"; + dampings=[0.8,0.9] + max_iters=[100000] + convergence_iters=[15] + preference_quantiles=[0.5,0.7] + + gs = affinity_lsi_grid_sweep(inpath, 'all', outpath, dampings, max_iters, convergence_iters, preference_quantiles) + gs.run(20) + gs.save("test_affinity/lsi_sweep.csv") + + +if __name__ == "__main__": + fire.Fire{'grid_sweep':affinity_grid_sweep, + 'grid_sweep_lsi':affinity_lsi_grid_sweep + 'cluster':affinity_job, + 'cluster_lsi':affinity_lsi_job}