X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/8a2248fae1ee5818576b9a8f2849d1ad0efd8187..7b14db67de8650e4858d3f102fbeab813a30ee29:/clustering/hdbscan_clustering.py?ds=inline

diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py
index f0ee703..e533808 100644
--- a/clustering/hdbscan_clustering.py
+++ b/clustering/hdbscan_clustering.py
@@ -1,5 +1,5 @@
-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
-from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
 from dataclasses import dataclass
 import hdbscan
 from sklearn.neighbors import NearestNeighbors
@@ -7,11 +7,8 @@ import plotnine as pn
 import numpy as np
 from itertools import product, starmap, chain
 import pandas as pd
-from sklearn.metrics import silhouette_score, silhouette_samples
-from pathlib import Path
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
 import fire
-from pyarrow.feather import write_feather
 
 def test_select_hdbscan_clustering():
     # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
@@ -40,28 +37,6 @@ def test_select_hdbscan_clustering():
 #     check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
 #     silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
 #     c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
-
-class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
-    def __init__(self,
-                 inpath,
-                 lsi_dims,
-                 outpath,
-                 min_cluster_sizes,
-                 min_samples,
-                 cluster_selection_epsilons,
-                 cluster_selection_methods
-                 ):
-
-        super().__init__(hdbscan_lsi_job,
-                         _hdbscan_lsi_grid_sweep,
-                         inpath,
-                         lsi_dims,
-                         outpath,
-                         min_cluster_sizes,
-                         min_samples,
-                         cluster_selection_epsilons,
-                         cluster_selection_methods)
-
 class hdbscan_grid_sweep(grid_sweep):
     def __init__(self,
                  inpath,
@@ -78,25 +53,6 @@ class hdbscan_grid_sweep(grid_sweep):
               cluster_selection_method):
         return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
 
-
-class _hdbscan_lsi_grid_sweep(grid_sweep):
-    def __init__(self,
-                 inpath,
-                 outpath,
-                 lsi_dim,
-                 *args,
-                 **kwargs):
-
-        self.lsi_dim = lsi_dim
-        self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
-
-
-    def namer(self, *args, **kwargs):
-        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
-        s += f"_lsi-{self.lsi_dim}"
-        return s
-
 @dataclass
 class hdbscan_clustering_result(clustering_result):
     min_cluster_size:int
@@ -104,10 +60,6 @@ class hdbscan_clustering_result(clustering_result):
     cluster_selection_epsilon:float
     cluster_selection_method:str
 
-@dataclass
-class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
-    pass
-
 class hdbscan_job(clustering_job):
     def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
         super().__init__(infile,
@@ -148,121 +100,29 @@ class hdbscan_job(clustering_job):
                                                 cluster_selection_method=self.cluster_selection_method)
         return self.result
 
-class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
-    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
-        super().__init__(
-                         infile,
-                         outpath,
-                         name,
-                         *args,
-                         **kwargs)
-        super().set_lsi_dims(lsi_dims)
-
-    def get_info(self):
-        partial_result = super().get_info()
-        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
-                                                    lsi_dimensions=self.lsi_dims)
-        return self.result
-
-# def select_hdbscan_clustering(inpath,
-#                               outpath,
-#                               outfile=None,
-#                               min_cluster_sizes=[2],
-#                               min_samples=[1],
-#                               cluster_selection_epsilons=[0],
-#                               cluster_selection_methods=['eom'],
-#                               lsi_dimensions='all'
-#                               ):
-
-#     inpath = Path(inpath)
-#     outpath = Path(outpath)
-#     outpath.mkdir(exist_ok=True, parents=True)
-
-#     if lsi_dimensions is None:
-#         lsi_paths = [inpath]
-#     elif lsi_dimensions == 'all':
-#         lsi_paths = list(inpath.glob("*"))
-
-#     else:
-#         lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
-#     if lsi_dimensions is not None:
-#         lsi_nums = [p.stem for p in lsi_paths]
-#     else:
-#         lsi_nums = [None]
-#     grid = list(product(lsi_nums,
-#                         min_cluster_sizes,
-#                         min_samples,
-#                         cluster_selection_epsilons,
-#                         cluster_selection_methods))
-
-#     # fix the output file names
-#     names = list(map(lambda t:'_'.join(map(str,t)),grid))
-
-#     grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
-
-#     with Pool(int(cpu_count()/4)) as pool:
-#         mods = starmap(hdbscan_clustering, grid)
-
-#     res = pd.DataFrame(mods)
-#     if outfile is None:
-#         outfile = outpath / "selection_data.csv"
-
-#     res.to_csv(outfile)
-
-# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-#     subreddits, mat = read_similarity_mat(similarities)
-#     mat = sim_to_dist(mat)
-#     clustering = _hdbscan_clustering(mat,
-#                                      min_cluster_size=min_cluster_size,
-#                                      min_samples=min_samples,
-#                                      cluster_selection_epsilon=cluster_selection_epsilon,
-#                                      cluster_selection_method=cluster_selection_method,
-#                                      metric='precomputed',
-#                                      core_dist_n_jobs=cpu_count()
-#                                      )
-
-#     cluster_data = process_clustering_result(clustering, subreddits)
-#     isolates = clustering.labels_ == -1
-#     scoremat = mat[~isolates][:,~isolates]
-#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-#     cluster_data.to_feather(output)
-#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-#     silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
-#     silsampout = output.parent / ("silhouette_samples" + output.name)
-#     silhouette_samp.to_feather(silsampout)
-
-#     result = hdbscan_clustering_result(outpath=output,
-#                                        silhouette_samples=silsampout,
-#                                        silhouette_score=score,
-#                                        name=name,
-#                                        min_cluster_size=min_cluster_size,
-#                                        min_samples=min_samples,
-#                                        cluster_selection_epsilon=cluster_selection_epsilon,
-#                                        cluster_selection_method=cluster_selection_method,
-#                                        lsi_dimensions=lsi_dim,
-#                                        n_isolates=isolates.sum(),
-#                                        n_clusters=len(set(clustering.labels_))
-#                                        )
-
-
-
-#     return(result)
-
-# # for all runs we should try cluster_selection_epsilon = None
-# # for terms we should try cluster_selection_epsilon around 0.56-0.66
-# # for authors we should try cluster_selection_epsilon around 0.98-0.99
-# def _hdbscan_clustering(mat, *args, **kwargs):
kwargs:{kwargs}") - -# print(mat) -# clusterer = hdbscan.HDBSCAN(*args, -# **kwargs, -# ) - -# clustering = clusterer.fit(mat.astype('double')) +def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): + """Run hdbscan clustering once or more with different parameters. -# return(clustering) + Usage: + hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods= + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. + min_cluster_sizes: one or more integers indicating the minumum cluster size + min_samples: one ore more integers indicating the minimum number of samples used in the algorithm + cluster_selection_epsilon: one or more similarity thresholds for transition from dbscan to hdbscan + cluster_selection_method: "eom" or "leaf" eom gives larger clusters. + """ + obj = hdbscan_grid_sweep(inpath, + outpath, + map(int,min_cluster_sizes), + map(int,min_samples), + map(float,cluster_selection_epsilons), + map(float,cluster_selection_methods)) + obj.run() + obj.save(savefile) def KNN_distances_plot(mat,outname,k=2): nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) @@ -293,10 +153,7 @@ def make_KNN_plots(): KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') if __name__ == "__main__": - fire.Fire{'grid_sweep':hdbscan_grid_sweep, - 'grid_sweep_lsi':hdbscan_lsi_grid_sweep - 'cluster':hdbscan_job, - 'cluster_lsi':hdbscan_lsi_job} + fire.Fire(run_hdbscan_grid_sweep) # test_select_hdbscan_clustering() #fire.Fire(select_hdbscan_clustering)