-from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
-from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
+from clustering_base import clustering_result, clustering_job
+from grid_sweep import grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
-from sklearn.metrics import silhouette_score, silhouette_samples
-from pathlib import Path
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
import fire
-from pyarrow.feather import write_feather
def test_select_hdbscan_clustering():
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')
# fire.Fire(select_hdbscan_clustering)
-
-class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
- def __init__(self,
- inpath,
- lsi_dims,
- outpath,
- min_cluster_sizes,
- min_samples,
- cluster_selection_epsilons,
- cluster_selection_methods
- ):
-
- super().__init__(hdbscan_lsi_job,
- _hdbscan_lsi_grid_sweep,
- inpath,
- lsi_dims,
- outpath,
- min_cluster_sizes,
- min_samples,
- cluster_selection_epsilons,
- cluster_selection_methods)
-
class hdbscan_grid_sweep(grid_sweep):
def __init__(self,
inpath,
cluster_selection_method):
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
-
-class _hdbscan_lsi_grid_sweep(grid_sweep):
- def __init__(self,
- inpath,
- outpath,
- lsi_dim,
- *args,
- **kwargs):
-
- self.lsi_dim = lsi_dim
- self.jobtype = hdbscan_lsi_job
- super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
-
-
- def namer(self, *args, **kwargs):
- s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
- s += f"_lsi-{self.lsi_dim}"
- return s
-
@dataclass
class hdbscan_clustering_result(clustering_result):
min_cluster_size:int
cluster_selection_epsilon:float
cluster_selection_method:str
-@dataclass
-class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
- pass
-
class hdbscan_job(clustering_job):
def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
super().__init__(infile,
cluster_selection_method=self.cluster_selection_method)
return self.result
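+# A minimal sketch of fitting one clustering directly, without the grid sweep.
+# The paths and name below are hypothetical, and this assumes clustering_job
+# (from clustering_base) provides run() and get_info():
+#
+# job = hdbscan_job("similarities/subreddit_comment_authors_10k.feather",
+#                   "clusters/authors_10k",
+#                   name="mcs-2_ms-1_cse-0_csm-eom",
+#                   min_cluster_size=2,
+#                   min_samples=1,
+#                   cluster_selection_epsilon=0,
+#                   cluster_selection_method='eom')
+# job.run()
+# result = job.get_info()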
-class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
- def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
- super().__init__(
- infile,
- outpath,
- name,
- *args,
- **kwargs)
- super().set_lsi_dims(lsi_dims)
-
- def get_info(self):
- partial_result = super().get_info()
- self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
- lsi_dimensions=self.lsi_dims)
- return self.result
-
-# def select_hdbscan_clustering(inpath,
-# outpath,
-# outfile=None,
-# min_cluster_sizes=[2],
-# min_samples=[1],
-# cluster_selection_epsilons=[0],
-# cluster_selection_methods=['eom'],
-# lsi_dimensions='all'
-# ):
-
-# inpath = Path(inpath)
-# outpath = Path(outpath)
-# outpath.mkdir(exist_ok=True, parents=True)
-
-# if lsi_dimensions is None:
-# lsi_paths = [inpath]
-# elif lsi_dimensions == 'all':
-# lsi_paths = list(inpath.glob("*"))
-
-# else:
-# lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
-
-# if lsi_dimensions is not None:
-# lsi_nums = [p.stem for p in lsi_paths]
-# else:
-# lsi_nums = [None]
-# grid = list(product(lsi_nums,
-# min_cluster_sizes,
-# min_samples,
-# cluster_selection_epsilons,
-# cluster_selection_methods))
-
-# # fix the output file names
-# names = list(map(lambda t:'_'.join(map(str,t)),grid))
-
-# grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
-
-# with Pool(int(cpu_count()/4)) as pool:
-# mods = starmap(hdbscan_clustering, grid)
-
-# res = pd.DataFrame(mods)
-# if outfile is None:
-# outfile = outpath / "selection_data.csv"
-
-# res.to_csv(outfile)
-
-# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-# subreddits, mat = read_similarity_mat(similarities)
-# mat = sim_to_dist(mat)
-# clustering = _hdbscan_clustering(mat,
-# min_cluster_size=min_cluster_size,
-# min_samples=min_samples,
-# cluster_selection_epsilon=cluster_selection_epsilon,
-# cluster_selection_method=cluster_selection_method,
-# metric='precomputed',
-# core_dist_n_jobs=cpu_count()
-# )
-
-# cluster_data = process_clustering_result(clustering, subreddits)
-# isolates = clustering.labels_ == -1
-# scoremat = mat[~isolates][:,~isolates]
-# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-# cluster_data.to_feather(output)
-# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
-# silsampout = output.parent / ("silhouette_samples" + output.name)
-# silhouette_samp.to_feather(silsampout)
-
-# result = hdbscan_clustering_result(outpath=output,
-# silhouette_samples=silsampout,
-# silhouette_score=score,
-# name=name,
-# min_cluster_size=min_cluster_size,
-# min_samples=min_samples,
-# cluster_selection_epsilon=cluster_selection_epsilon,
-# cluster_selection_method=cluster_selection_method,
-# lsi_dimensions=lsi_dim,
-# n_isolates=isolates.sum(),
-# n_clusters=len(set(clustering.labels_))
-# )
-
-
-
-# return(result)
-
-# # for all runs we should try cluster_selection_epsilon = None
-# # for terms we should try cluster_selection_epsilon around 0.56-0.66
-# # for authors we should try cluster_selection_epsilon around 0.98-0.99
-# def _hdbscan_clustering(mat, *args, **kwargs):
-# print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
-
-# print(mat)
-# clusterer = hdbscan.HDBSCAN(*args,
-# **kwargs,
-# )
-
-# clustering = clusterer.fit(mat.astype('double'))
+def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
+ """Run hdbscan clustering once or more with different parameters.
-# return(clustering)
+ Usage:
+ hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
+
+ Keyword arguments:
+ savefile: path to save the sweep metadata and diagnostics.
+ inpath: path to feather data containing a labeled matrix of subreddit similarities.
+ outpath: path to output fit hdbscan clusterings.
+ min_cluster_sizes: one or more integers indicating the minimum cluster size.
+ min_samples: one or more integers indicating the minimum number of samples used in the algorithm.
+ cluster_selection_epsilons: one or more distance thresholds controlling the transition from DBSCAN-like to HDBSCAN-like cluster selection.
+ cluster_selection_methods: "eom" or "leaf"; "eom" tends to give larger clusters.
+ """
+ obj = hdbscan_grid_sweep(inpath,
+ outpath,
+ map(int,min_cluster_sizes),
+ map(int,min_samples),
+ map(float,cluster_selection_epsilons),
+ cluster_selection_methods)
+ obj.run()
+ obj.save(savefile)
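+
+# Example invocation (paths are hypothetical; each comma-separated list is a
+# parameter to sweep over, and the grid of parameter combinations is fit):
+#
+#   python3 hdbscan_clustering.py --savefile=hdbscan_sweep.csv \
+#       --inpath=similarities/subreddit_comment_authors_10k.feather \
+#       --outpath=clusters/authors_10k \
+#       --min_cluster_sizes=2,3 \
+#       --min_samples=1,2 \
+#       --cluster_selection_epsilons=0,0.05 \
+#       --cluster_selection_methods=eom,leaf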
def KNN_distances_plot(mat,outname,k=2):
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
if __name__ == "__main__":
- fire.Fire{'grid_sweep':hdbscan_grid_sweep,
- 'grid_sweep_lsi':hdbscan_lsi_grid_sweep
- 'cluster':hdbscan_job,
- 'cluster_lsi':hdbscan_lsi_job}
+ fire.Fire(run_hdbscan_grid_sweep)
# test_select_hdbscan_clustering()
#fire.Fire(select_hdbscan_clustering)