+
+@dataclass
+class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
+    """Result record combining ``hdbscan_clustering_result`` with ``lsi_result_mixin``.
+
+    No fields are declared here; whatever fields the record has come from the
+    two base classes.
+    """
+
+class hdbscan_job(clustering_job):
+    """Clustering job whose clustering callable is HDBSCAN.
+
+    The four HDBSCAN parameters are forwarded to ``clustering_job.__init__``
+    as keyword arguments (together with the callable to run) and are also
+    kept on the instance so that ``get_info`` can report them.
+    """
+
+    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1,
+                 cluster_selection_epsilon=0, cluster_selection_method='eom'):
+        """Set up the job and remember the HDBSCAN parameters.
+
+        Args:
+            infile: Passed through unchanged to ``clustering_job.__init__``.
+            outpath: Passed through unchanged to ``clustering_job.__init__``.
+            name: Passed through unchanged to ``clustering_job.__init__``.
+            min_cluster_size: HDBSCAN ``min_cluster_size`` setting.
+            min_samples: HDBSCAN ``min_samples`` setting.
+            cluster_selection_epsilon: HDBSCAN ``cluster_selection_epsilon``
+                setting.
+            cluster_selection_method: HDBSCAN ``cluster_selection_method``
+                setting (default ``'eom'``).
+        """
+        # The base class receives the callable plus the parameters as keyword
+        # arguments; presumably it hands them back to `call` when the job
+        # runs -- confirm against clustering_job.
+        super().__init__(infile,
+                         outpath,
+                         name,
+                         call=hdbscan_job._hdbscan_clustering,
+                         min_cluster_size=min_cluster_size,
+                         min_samples=min_samples,
+                         cluster_selection_epsilon=cluster_selection_epsilon,
+                         cluster_selection_method=cluster_selection_method)
+
+        self.min_cluster_size = min_cluster_size
+        self.min_samples = min_samples
+        self.cluster_selection_epsilon = cluster_selection_epsilon
+        self.cluster_selection_method = cluster_selection_method
+        # NOTE(review): a conversion `self.mat = 1 - self.mat` used to sit here
+        # and is disabled. HDBSCAN is invoked with metric='precomputed' below,
+        # so confirm that the matrix loaded by the base class already holds
+        # distances rather than similarities.
+
+    @staticmethod
+    def _hdbscan_clustering(mat, *args, **kwargs):
+        """Fit HDBSCAN on a precomputed matrix.
+
+        Declared as a static method because it takes no ``self``: the matrix
+        is the first argument. Looking it up on the class still yields the
+        plain function, so ``hdbscan_job._hdbscan_clustering`` keeps working
+        as the ``call`` argument, and a lookup through an instance no longer
+        binds the instance as ``mat``.
+
+        Args:
+            mat: Object with an ``astype`` method; it is cast to ``'double'``
+                before fitting.
+            *args: Extra positional arguments for ``hdbscan.HDBSCAN``.
+            **kwargs: Extra keyword arguments for ``hdbscan.HDBSCAN``. Passing
+                ``metric`` or ``core_dist_n_jobs`` here raises ``TypeError``
+                because both are fixed below.
+
+        Returns:
+            Whatever ``HDBSCAN.fit`` returns (per the hdbscan API, the fitted
+            estimator).
+        """
+        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
+        print(mat)
+        clusterer = hdbscan.HDBSCAN(*args,
+                                    metric='precomputed',
+                                    core_dist_n_jobs=cpu_count(),
+                                    **kwargs)
+
+        clustering = clusterer.fit(mat.astype('double'))
+
+        return clustering
+
+    def get_info(self):
+        """Build, store and return the result record for this job.
+
+        Takes the record produced by the base class's ``get_info``, copies all
+        of its attributes into an ``hdbscan_clustering_result`` and adds the
+        four HDBSCAN parameters. The new record is stored on ``self.result``.
+
+        Returns:
+            The ``hdbscan_clustering_result`` stored on ``self.result``.
+        """
+        result = super().get_info()
+        self.result = hdbscan_clustering_result(**result.__dict__,
+                                                min_cluster_size=self.min_cluster_size,
+                                                min_samples=self.min_samples,
+                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
+                                                cluster_selection_method=self.cluster_selection_method)
+        return self.result
+
+class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
+    """HDBSCAN job that additionally carries an LSI dimensions setting."""
+
+    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
+        """Initialise the HDBSCAN job, then register ``lsi_dims``.
+
+        Args:
+            infile: Forwarded to ``hdbscan_job.__init__``.
+            outpath: Forwarded to ``hdbscan_job.__init__``.
+            name: Forwarded to ``hdbscan_job.__init__``.
+            lsi_dims: Value handed to ``set_lsi_dims`` once the parent
+                initialiser has run.
+            *args: Further positional arguments for ``hdbscan_job.__init__``.
+            **kwargs: Further keyword arguments for ``hdbscan_job.__init__``.
+        """
+        super().__init__(infile, outpath, name, *args, **kwargs)
+        super().set_lsi_dims(lsi_dims)
+
+    def get_info(self):
+        """Return the parent's result record extended with ``lsi_dimensions``.
+
+        Every attribute of the record returned by the parent ``get_info`` is
+        copied into an ``hdbscan_clustering_result_lsi``, with
+        ``self.lsi_dims`` supplied as ``lsi_dimensions``. The new record
+        replaces ``self.result`` and is returned.
+        """
+        base_info = super().get_info()
+        inherited_fields = vars(base_info)
+        self.result = hdbscan_clustering_result_lsi(
+            **inherited_fields,
+            lsi_dimensions=self.lsi_dims,
+        )
+        return self.result
+
+# def select_hdbscan_clustering(inpath,
+# outpath,
+# outfile=None,
+# min_cluster_sizes=[2],
+# min_samples=[1],
+# cluster_selection_epsilons=[0],
+# cluster_selection_methods=['eom'],
+# lsi_dimensions='all'
+# ):
+
+# inpath = Path(inpath)
+# outpath = Path(outpath)
+# outpath.mkdir(exist_ok=True, parents=True)