diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py
new file mode 100644
index 0000000..f0ee703
--- /dev/null
+++ b/clustering/hdbscan_clustering.py
@@ -0,0 +1,302 @@
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather

def test_select_hdbscan_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_hdbscan"
    min_cluster_sizes = [2, 3, 4]
    min_samples = [1, 2, 3]
    cluster_selection_epsilons = [0, 0.1, 0.3, 0.5]
    cluster_selection_methods = ['eom']
    lsi_dimensions = 'all'
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")
    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom')
    # job1.run()
    # print(job1.get_info())

    # df = pd.read_csv("test_hdbscan/selection_data.csv")
    # test_select_hdbscan_clustering()
    # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
    # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
    # c = check_clusters.merge(silscores, on='subreddit')
    # fire.Fire(select_hdbscan_clustering)

class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    """Sweep HDBSCAN parameters over every LSI dimensionality found under inpath."""
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 min_cluster_sizes,
                 min_samples,
                 cluster_selection_epsilons,
                 cluster_selection_methods
                 ):

        super().__init__(hdbscan_lsi_job,
                         _hdbscan_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         min_cluster_sizes,
                         min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)

class hdbscan_grid_sweep(grid_sweep):
    """Sweep HDBSCAN parameters over a single similarity matrix."""
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):

        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self,
              min_cluster_size,
              min_samples,
              cluster_selection_epsilon,
              cluster_selection_method):
        # file-name stem for one point of the parameter grid
        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"


class _hdbscan_lsi_grid_sweep(grid_sweep):
    """Inner sweep over HDBSCAN parameters for one fixed LSI dimensionality."""
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):

        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)

    def namer(self, *args, **kwargs):
        # drop the leading lsi-dimension argument passed through the grid,
        # reuse the base namer for the HDBSCAN parameters, then tag the dimensionality
        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s
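# --- illustrative sketch (not part of the original pipeline) -----------------
# The namer methods above fully determine how each grid point is turned into a
# file-name stem. Because the base namer never touches `self`, it can be called
# unbound, so the naming scheme can be checked without building a sweep. The
# helper below is only an example and is never called elsewhere in this module.
def _example_output_names():
    # -> 'mcs-2_ms-1_cse-0.05_csm-eom'
    stem = hdbscan_grid_sweep.namer(None, 2, 1, 0.05, 'eom')
    # the LSI variant appends the dimensionality, giving e.g.
    # 'mcs-2_ms-1_cse-0.05_csm-eom_lsi-300'
    return stem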
@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str

@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass

class hdbscan_job(clustering_job):
    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
        super().__init__(infile,
                         outpath,
                         name,
                         call=hdbscan_job._hdbscan_clustering,
                         min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
                         cluster_selection_epsilon=cluster_selection_epsilon,
                         cluster_selection_method=cluster_selection_method
                         )

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.cluster_selection_method = cluster_selection_method
#        self.mat = 1 - self.mat

    # handed to clustering_job as a plain callable (call=hdbscan_job._hdbscan_clustering),
    # so it receives the prepared matrix rather than self
    def _hdbscan_clustering(mat, *args, **kwargs):
        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
        print(mat)
        clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                    core_dist_n_jobs=cpu_count(),
                                    *args,
                                    **kwargs,
                                    )

        clustering = clusterer.fit(mat.astype('double'))

        return(clustering)

    def get_info(self):
        # extend the generic clustering result with the HDBSCAN parameters
        result = super().get_info()
        self.result = hdbscan_clustering_result(**result.__dict__,
                                                min_cluster_size=self.min_cluster_size,
                                                min_samples=self.min_samples,
                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                cluster_selection_method=self.cluster_selection_method)
        return self.result

class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(infile,
                         outpath,
                         name,
                         *args,
                         **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        partial_result = super().get_info()
        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
                                                    lsi_dimensions=self.lsi_dims)
        return self.result

# def select_hdbscan_clustering(inpath,
#                               outpath,
#                               outfile=None,
#                               min_cluster_sizes=[2],
#                               min_samples=[1],
#                               cluster_selection_epsilons=[0],
#                               cluster_selection_methods=['eom'],
#                               lsi_dimensions='all'
#                               ):

#     inpath = Path(inpath)
#     outpath = Path(outpath)
#     outpath.mkdir(exist_ok=True, parents=True)

#     if lsi_dimensions is None:
#         lsi_paths = [inpath]
#     elif lsi_dimensions == 'all':
#         lsi_paths = list(inpath.glob("*"))

#     else:
#         lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]

#     if lsi_dimensions is not None:
#         lsi_nums = [p.stem for p in lsi_paths]
#     else:
#         lsi_nums = [None]
#     grid = list(product(lsi_nums,
#                         min_cluster_sizes,
#                         min_samples,
#                         cluster_selection_epsilons,
#                         cluster_selection_methods))

#     # fix the output file names
#     names = list(map(lambda t: '_'.join(map(str, t)), grid))

#     grid = [(inpath/(str(t[0])+'.feather'), outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]

#     with Pool(int(cpu_count()/4)) as pool:
#         mods = starmap(hdbscan_clustering, grid)

#     res = pd.DataFrame(mods)
#     if outfile is None:
#         outfile = outpath / "selection_data.csv"

#     res.to_csv(outfile)
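# --- illustrative sketch (not part of the original pipeline) -----------------
# hdbscan_job above always clusters with metric='precomputed' on a distance
# matrix prepared by clustering_job. The toy function below shows the same call
# pattern on a small synthetic similarity matrix; treating distance as
# 1 - similarity is an assumption about what sim_to_dist does, and the function
# is never called by the pipeline.
def _example_precomputed_hdbscan(n=30, seed=0):
    rng = np.random.default_rng(seed)
    sims = rng.uniform(0, 1, size=(n, n))
    sims = (sims + sims.T) / 2     # symmetrize the random similarities
    np.fill_diagonal(sims, 1.0)    # self-similarity of 1
    dists = 1 - sims               # assumed sim_to_dist behaviour

    clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                min_cluster_size=2,
                                min_samples=1,
                                cluster_selection_epsilon=0.0,
                                cluster_selection_method='eom')
    clustering = clusterer.fit(dists.astype('double'))
    # label -1 marks noise points (the "isolates" referred to elsewhere in this file)
    return clustering.labels_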
# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
#     subreddits, mat = read_similarity_mat(similarities)
#     mat = sim_to_dist(mat)
#     clustering = _hdbscan_clustering(mat,
#                                      min_cluster_size=min_cluster_size,
#                                      min_samples=min_samples,
#                                      cluster_selection_epsilon=cluster_selection_epsilon,
#                                      cluster_selection_method=cluster_selection_method,
#                                      metric='precomputed',
#                                      core_dist_n_jobs=cpu_count()
#                                      )

#     cluster_data = process_clustering_result(clustering, subreddits)
#     isolates = clustering.labels_ == -1
#     scoremat = mat[~isolates][:,~isolates]
#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
#     cluster_data.to_feather(output)
#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
#     silhouette_samp = pd.DataFrame({'subreddit':subreddits, 'score':silhouette_samp})
#     silsampout = output.parent / ("silhouette_samples" + output.name)
#     silhouette_samp.to_feather(silsampout)

#     result = hdbscan_clustering_result(outpath=output,
#                                        silhouette_samples=silsampout,
#                                        silhouette_score=score,
#                                        name=name,
#                                        min_cluster_size=min_cluster_size,
#                                        min_samples=min_samples,
#                                        cluster_selection_epsilon=cluster_selection_epsilon,
#                                        cluster_selection_method=cluster_selection_method,
#                                        lsi_dimensions=lsi_dim,
#                                        n_isolates=isolates.sum(),
#                                        n_clusters=len(set(clustering.labels_))
#                                        )

#     return(result)

# # for all runs we should try cluster_selection_epsilon = None
# # for terms we should try cluster_selection_epsilon around 0.56-0.66
# # for authors we should try cluster_selection_epsilon around 0.98-0.99
# def _hdbscan_clustering(mat, *args, **kwargs):
#     print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")

#     print(mat)
#     clusterer = hdbscan.HDBSCAN(*args,
#                                 **kwargs,
#                                 )

#     clustering = clusterer.fit(mat.astype('double'))

#     return(clustering)

def KNN_distances_plot(mat, outname, k=2):
    # plot each point's distance to its kth nearest neighbor, sorted in
    # descending order; the knee of this curve is a common heuristic for
    # picking a density threshold such as cluster_selection_epsilon
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:,-1]
    df = pd.DataFrame({'dist':d2})
    df = df.sort_values("dist", ascending=False)
    df['idx'] = np.arange(0, d2.shape[0]) + 1
    p = pn.qplot(x='idx', y='dist', data=df, geom='line') + pn.scales.scale_y_continuous(minor_breaks=np.arange(0, 50)/50,
                                                                                         breaks=np.arange(0, 10)/10)
    p.save(outname, width=16, height=10)

def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)

    KNN_distances_plot(mat, k=2, outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
    fire.Fire({'grid_sweep': hdbscan_grid_sweep,
               'grid_sweep_lsi': hdbscan_lsi_grid_sweep,
               'cluster': hdbscan_job,
               'cluster_lsi': hdbscan_lsi_job})

# test_select_hdbscan_clustering()
# fire.Fire(select_hdbscan_clustering)
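# --- illustrative usage (a sketch; not from the original file) ----------------
# With python-fire exposing the mapping above, the module can be driven from the
# shell. Flag names mirror the constructor arguments; the paths below are
# placeholders, and the exact quoting and chaining rules are python-fire's, so
# treat these invocations as an assumed sketch rather than tested commands. The
# test function at the top of the file shows the equivalent sweep driven from
# Python (gs.run(20); gs.save(...)).
#
#   python3 hdbscan_clustering.py cluster \
#       --infile=/path/to/similarities.feather \
#       --outpath=/path/to/output --name=example \
#       --min_cluster_size=2 --min_samples=1 \
#       --cluster_selection_epsilon=0 --cluster_selection_method=eom \
#       run
#
#   python3 hdbscan_clustering.py grid_sweep_lsi \
#       --inpath=/path/to/lsi_similarities --lsi_dims=all \
#       --outpath=/path/to/sweep \
#       --min_cluster_sizes="[2,3,4]" --min_samples="[1,2]" \
#       --cluster_selection_epsilons="[0,0.1,0.3]" \
#       --cluster_selection_methods="['eom']"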