-# Job subclass that runs HDBSCAN clustering on an LSI-projected similarity matrix.
-class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
-    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
-        super().__init__(infile,
-                         outpath,
-                         name,
-                         *args,
-                         **kwargs)
-        # record the LSI dimensionality via the mixin
-        super().set_lsi_dims(lsi_dims)
-
-    def get_info(self):
-        # extend the base job's result with the number of LSI dimensions
-        partial_result = super().get_info()
-        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
-                                                    lsi_dimensions=self.lsi_dims)
-        return self.result
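-# A minimal usage sketch. The paths, parameter values, and the run() entry point
-# are assumptions for illustration, not part of this class:
-#
-#   job = hdbscan_lsi_job("similarities/lsi/300.feather", "clusters/lsi",
-#                         name="300_2_1_0_eom", lsi_dims=300,
-#                         min_cluster_size=2, min_samples=1,
-#                         cluster_selection_epsilon=0,
-#                         cluster_selection_method='eom')
-#   job.run()
-#   info = job.get_info()  # hdbscan_clustering_result_lsi with lsi_dimensions=300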
-
-# def select_hdbscan_clustering(inpath,
-#                               outpath,
-#                               outfile=None,
-#                               min_cluster_sizes=[2],
-#                               min_samples=[1],
-#                               cluster_selection_epsilons=[0],
-#                               cluster_selection_methods=['eom'],
-#                               lsi_dimensions='all'
-#                               ):
-
-#     inpath = Path(inpath)
-#     outpath = Path(outpath)
-#     outpath.mkdir(exist_ok=True, parents=True)
-
-#     if lsi_dimensions is None:
-#         lsi_paths = [inpath]
-#     elif lsi_dimensions == 'all':
-#         lsi_paths = list(inpath.glob("*"))
-#     else:
-#         lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]
-
-#     if lsi_dimensions is not None:
-#         lsi_nums = [p.stem for p in lsi_paths]
-#     else:
-#         lsi_nums = [None]
-#     grid = list(product(lsi_nums,
-#                         min_cluster_sizes,
-#                         min_samples,
-#                         cluster_selection_epsilons,
-#                         cluster_selection_methods))
-
-#     # build the output file names from the parameter tuples
-#     names = list(map(lambda t: '_'.join(map(str, t)), grid))
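-#     # e.g. the tuple (300, 2, 1, 0, 'eom') becomes the name "300_2_1_0_eom"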
-
-#     # note: when lsi_nums == [None] this still builds inpath / 'None.feather'
-#     grid = [(inpath / (str(t[0]) + '.feather'), outpath / (name + '.feather'), t[0], name) + t[1:]
-#             for t, name in zip(grid, names)]
-
-#     # use the pool's starmap; the bare itertools.starmap would run serially
-#     # and leave the pool unused
-#     with Pool(int(cpu_count() / 4)) as pool:
-#         mods = pool.starmap(hdbscan_clustering, grid)
-
-#     res = pd.DataFrame(mods)
-#     if outfile is None:
-#         outfile = outpath / "selection_data.csv"
-
-#     res.to_csv(outfile)
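-#     # selection_data.csv then holds one row per grid point (name, silhouette
-#     # score, parameters, ...) for comparing the candidate clusterings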
-
-# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
-#     subreddits, mat = read_similarity_mat(similarities)
-#     mat = sim_to_dist(mat)
-#     clustering = _hdbscan_clustering(mat,
-#                                      min_cluster_size=min_cluster_size,
-#                                      min_samples=min_samples,
-#                                      cluster_selection_epsilon=cluster_selection_epsilon,
-#                                      cluster_selection_method=cluster_selection_method,
-#                                      metric='precomputed',
-#                                      core_dist_n_jobs=cpu_count())
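-#     # metric='precomputed' means HDBSCAN expects a distance matrix, which is
-#     # why sim_to_dist converts the similarity matrix above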
-
-#     cluster_data = process_clustering_result(clustering, subreddits)
-#     # HDBSCAN labels noise points ("isolates") -1; exclude them from the score
-#     isolates = clustering.labels_ == -1
-#     scoremat = mat[~isolates][:, ~isolates]
-#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
-#     cluster_data.to_feather(output)
-#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
-#     silhouette_samp = pd.DataFrame({'subreddit': subreddits, 'score': silhouette_samp})
-#     silsampout = output.parent / ("silhouette_samples" + output.name)
-#     silhouette_samp.to_feather(silsampout)
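-#     # e.g. for output "300_2_1_0_eom.feather" this writes a sibling file named
-#     # "silhouette_samples300_2_1_0_eom.feather" (note: no separator)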
-
-#     result = hdbscan_clustering_result(outpath=output,
-#                                        silhouette_samples=silsampout,
-#                                        silhouette_score=score,
-#                                        name=name,
-#                                        min_cluster_size=min_cluster_size,
-#                                        min_samples=min_samples,
-#                                        cluster_selection_epsilon=cluster_selection_epsilon,
-#                                        cluster_selection_method=cluster_selection_method,
-#                                        lsi_dimensions=lsi_dim,
-#                                        n_isolates=isolates.sum(),
-#                                        # don't count the -1 noise label as a cluster
-#                                        n_clusters=len(set(clustering.labels_) - {-1})
-#                                        )
-
-#     return result
-
-# # for all runs we should try cluster_selection_epsilon = None
-# # for terms we should try cluster_selection_epsilon around 0.56-0.66
-# # for authors we should try cluster_selection_epsilon around 0.98-0.99
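-# # e.g. a hypothetical selection run over the author epsilon range:
-# # select_hdbscan_clustering("similarities/lsi", "clusters/authors",
-# #                           cluster_selection_epsilons=[0.98, 0.985, 0.99],
-# #                           cluster_selection_methods=['eom', 'leaf'])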
-# def _hdbscan_clustering(mat, *args, **kwargs):
-#     print(f"running hdbscan clustering. args: {args}. kwargs: {kwargs}")
-#     print(mat)
-
-#     clusterer = hdbscan.HDBSCAN(*args, **kwargs)
-#     clustering = clusterer.fit(mat.astype('double'))
-#     # return the fitted model; hdbscan_clustering() above reads its labels_
-#     return clustering