diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py
index 888554a..e533808 100644
--- a/clustering/hdbscan_clustering.py
+++ b/clustering/hdbscan_clustering.py
@@ -1,32 +1,57 @@
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
 from dataclasses import dataclass
 import hdbscan
 from sklearn.neighbors import NearestNeighbors
 import plotnine as pn
 import numpy as np
-from itertools import product, starmap
+from itertools import product, starmap, chain
 import pandas as pd
-from sklearn.metrics import silhouette_score, silhouette_samples
-from pathlib import Path
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
 import fire
-from pyarrow.feather import write_feather
 
 def test_select_hdbscan_clustering():
-    select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
-                              "test_hdbscan_author30k",
-                              min_cluster_sizes=[2],
-                              min_samples=[1,2],
-                              cluster_selection_epsilons=[0,0.05,0.1,0.15],
-                              cluster_selection_methods=['eom','leaf'],
-                              lsi_dimensions='all')
-    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI"
+    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
+    #                           "test_hdbscan_author30k",
+    #                           min_cluster_sizes=[2],
+    #                           min_samples=[1,2],
+    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
+    #                           cluster_selection_methods=['eom','leaf'],
+    #                           lsi_dimensions='all')
+    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
     outpath = "test_hdbscan";
     min_cluster_sizes=[2,3,4];
     min_samples=[1,2,3];
     cluster_selection_epsilons=[0,0.1,0.3,0.5];
     cluster_selection_methods=['eom'];
     lsi_dimensions='all'
+    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
+    gs.run(20)
+    gs.save("test_hdbscan/lsi_sweep.csv")
+    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
+    # job1.run()
+    # print(job1.get_info())
+
+    # df = pd.read_csv("test_hdbscan/selection_data.csv")
+    # test_select_hdbscan_clustering()
+    # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
+    # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
+    # c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
+class hdbscan_grid_sweep(grid_sweep):
+    def __init__(self,
+                 inpath,
+                 outpath,
+                 *args,
+                 **kwargs):
+
+        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)
+
+    def namer(self,
+              min_cluster_size,
+              min_samples,
+              cluster_selection_epsilon,
+              cluster_selection_method):
+        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
 
 @dataclass
 class hdbscan_clustering_result(clustering_result):
@@ -34,107 +59,70 @@ class hdbscan_clustering_result(clustering_result):
     min_samples:int
     cluster_selection_epsilon:float
     cluster_selection_method:str
-    lsi_dimensions:int
-    n_isolates:int
-    silhouette_samples:str
-
-def select_hdbscan_clustering(inpath,
-                              outpath,
-                              outfile=None,
-                              min_cluster_sizes=[2],
-                              min_samples=[1],
-                              cluster_selection_epsilons=[0],
-                              cluster_selection_methods=['eom'],
-                              lsi_dimensions='all'
-                              ):
-
-    inpath = Path(inpath)
-    outpath = Path(outpath)
-    outpath.mkdir(exist_ok=True, parents=True)
-
-    if lsi_dimensions == 'all':
-        lsi_paths = list(inpath.glob("*"))
-
-    else:
-        lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
-    lsi_nums = [p.stem for p in lsi_paths]
-    grid = list(product(lsi_nums,
-                        min_cluster_sizes,
-                        min_samples,
-                        cluster_selection_epsilons,
-                        cluster_selection_methods))
-
-    # fix the output file names
-    names = list(map(lambda t:'_'.join(map(str,t)),grid))
-
-    grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
-
-    with Pool(int(cpu_count()/4)) as pool:
-        mods = starmap(hdbscan_clustering, grid)
-
-    res = pd.DataFrame(mods)
-    if outfile is None:
-        outfile = outpath / "selection_data.csv"
-    res.to_csv(outfile)
-
-def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-    subreddits, mat = read_similarity_mat(similarities)
-    mat = sim_to_dist(mat)
-    clustering = _hdbscan_clustering(mat,
-                                     min_cluster_size=min_cluster_size,
-                                     min_samples=min_samples,
-                                     cluster_selection_epsilon=cluster_selection_epsilon,
-                                     cluster_selection_method=cluster_selection_method,
-                                     metric='precomputed',
-                                     core_dist_n_jobs=cpu_count()
-                                     )
-
-    cluster_data = process_clustering_result(clustering, subreddits)
-    isolates = clustering.labels_ == -1
-    scoremat = mat[~isolates][:,~isolates]
-    score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-    cluster_data.to_feather(output)
-
-    silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-    silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
-    silsampout = output.parent / ("silhouette_samples" + output.name)
-    silhouette_samp.to_feather(silsampout)
-
-    result = hdbscan_clustering_result(outpath=output,
-                                       max_iter=None,
-                                       silhouette_samples=silsampout,
-                                       silhouette_score=score,
-                                       alt_silhouette_score=score,
-                                       name=name,
-                                       min_cluster_size=min_cluster_size,
-                                       min_samples=min_samples,
-                                       cluster_selection_epsilon=cluster_selection_epsilon,
-                                       cluster_selection_method=cluster_selection_method,
-                                       lsi_dimensions=lsi_dim,
-                                       n_isolates=isolates.sum(),
-                                       n_clusters=len(set(clustering.labels_))
-                                       )
-
-
-
-    return(result)
-
-# for all runs we should try cluster_selection_epsilon = None
-# for terms we should try cluster_selection_epsilon around 0.56-0.66
-# for authors we should try cluster_selection_epsilon around 0.98-0.99
-def _hdbscan_clustering(mat, *args, **kwargs):
-    print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
kwargs:{kwargs}") - - print(mat) - clusterer = hdbscan.HDBSCAN(*args, - **kwargs, - ) +class hdbscan_job(clustering_job): + def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): + super().__init__(infile, + outpath, + name, + call=hdbscan_job._hdbscan_clustering, + min_cluster_size=min_cluster_size, + min_samples=min_samples, + cluster_selection_epsilon=cluster_selection_epsilon, + cluster_selection_method=cluster_selection_method + ) + + self.min_cluster_size = min_cluster_size + self.min_samples = min_samples + self.cluster_selection_epsilon = cluster_selection_epsilon + self.cluster_selection_method = cluster_selection_method +# self.mat = 1 - self.mat + + def _hdbscan_clustering(mat, *args, **kwargs): + print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}") + print(mat) + clusterer = hdbscan.HDBSCAN(metric='precomputed', + core_dist_n_jobs=cpu_count(), + *args, + **kwargs, + ) - clustering = clusterer.fit(mat.astype('double')) + clustering = clusterer.fit(mat.astype('double')) - return(clustering) + return(clustering) + + def get_info(self): + result = super().get_info() + self.result = hdbscan_clustering_result(**result.__dict__, + min_cluster_size=self.min_cluster_size, + min_samples=self.min_samples, + cluster_selection_epsilon=self.cluster_selection_epsilon, + cluster_selection_method=self.cluster_selection_method) + return self.result + +def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): + """Run hdbscan clustering once or more with different parameters. + + Usage: + hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods= + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. + min_cluster_sizes: one or more integers indicating the minumum cluster size + min_samples: one ore more integers indicating the minimum number of samples used in the algorithm + cluster_selection_epsilon: one or more similarity thresholds for transition from dbscan to hdbscan + cluster_selection_method: "eom" or "leaf" eom gives larger clusters. + """ + obj = hdbscan_grid_sweep(inpath, + outpath, + map(int,min_cluster_sizes), + map(int,min_samples), + map(float,cluster_selection_epsilons), + map(float,cluster_selection_methods)) + obj.run() + obj.save(savefile) def KNN_distances_plot(mat,outname,k=2): nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) @@ -165,8 +153,7 @@ def make_KNN_plots(): KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') if __name__ == "__main__": - df = pd.read_csv("test_hdbscan/selection_data.csv") - test_select_hdbscan_clustering() - check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") - silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") - c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) + fire.Fire(run_hdbscan_grid_sweep) + +# test_select_hdbscan_clustering() + #fire.Fire(select_hdbscan_clustering)