- res.to_csv(outfile)
-
def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
    """Cluster a precomputed similarity matrix with HDBSCAN and persist results.

    Reads the similarity matrix named by `similarities`, converts it to a
    distance matrix, runs HDBSCAN with a precomputed metric, writes the
    cluster assignments and per-sample silhouette scores to feather files,
    and returns an `hdbscan_clustering_result` record describing the run.

    Parameters
    ----------
    similarities : path to the similarity matrix (read via read_similarity_mat).
    output : pathlib.Path — feather output for cluster assignments; the
        per-sample silhouette file is written alongside it.
    lsi_dim : LSI dimensionality, recorded in the result only.
    name : run name, recorded in the result only.
    min_cluster_size, min_samples, cluster_selection_epsilon,
    cluster_selection_method : passed through to hdbscan.HDBSCAN.

    Returns
    -------
    hdbscan_clustering_result
    """
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)

    clustering = _hdbscan_clustering(mat,
                                     min_cluster_size=min_cluster_size,
                                     min_samples=min_samples,
                                     cluster_selection_epsilon=cluster_selection_epsilon,
                                     cluster_selection_method=cluster_selection_method,
                                     metric='precomputed',
                                     core_dist_n_jobs=cpu_count())

    cluster_data = process_clustering_result(clustering, subreddits)
    # Persist assignments before the (fragile) silhouette computation so a
    # degenerate clustering does not lose the cluster output.
    cluster_data.to_feather(output)

    # HDBSCAN labels noise points -1; exclude them from the overall score.
    isolates = clustering.labels_ == -1
    scoremat = mat[~isolates][:, ~isolates]
    kept_labels = clustering.labels_[~isolates]
    n_kept_clusters = len(set(kept_labels))
    # silhouette_score requires 2 <= n_clusters < n_samples; previously a
    # degenerate clustering crashed here. Record None instead.
    if 2 <= n_kept_clusters < len(kept_labels):
        score = silhouette_score(scoremat, kept_labels, metric='precomputed')
    else:
        score = None

    # NOTE(review): per-sample scores use the full matrix, so isolates (-1)
    # are treated as one extra cluster here — confirm that is intended.
    silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
    silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
    silsampout = output.parent / ("silhouette_samples" + output.name)
    silhouette_samp.to_feather(silsampout)

    return hdbscan_clustering_result(outpath=output,
                                     max_iter=None,
                                     silhouette_samples=silsampout,
                                     silhouette_score=score,
                                     alt_silhouette_score=score,
                                     name=name,
                                     min_cluster_size=min_cluster_size,
                                     min_samples=min_samples,
                                     cluster_selection_epsilon=cluster_selection_epsilon,
                                     cluster_selection_method=cluster_selection_method,
                                     lsi_dimensions=lsi_dim,
                                     n_isolates=isolates.sum(),
                                     n_clusters=len(set(clustering.labels_))
                                     )
-
-# for all runs we should try cluster_selection_epsilon = None
-# for terms we should try cluster_selection_epsilon around 0.56-0.66
-# for authors we should try cluster_selection_epsilon around 0.98-0.99
-def _hdbscan_clustering(mat, *args, **kwargs):
- print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
-
- print(mat)
- clusterer = hdbscan.HDBSCAN(*args,
- **kwargs,
- )
+class hdbscan_job(clustering_job):
+ def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
+ super().__init__(infile,
+ outpath,
+ name,
+ call=hdbscan_job._hdbscan_clustering,
+ min_cluster_size=min_cluster_size,
+ min_samples=min_samples,
+ cluster_selection_epsilon=cluster_selection_epsilon,
+ cluster_selection_method=cluster_selection_method
+ )
+
+ self.min_cluster_size = min_cluster_size
+ self.min_samples = min_samples
+ self.cluster_selection_epsilon = cluster_selection_epsilon
+ self.cluster_selection_method = cluster_selection_method
+# self.mat = 1 - self.mat
+
+ def _hdbscan_clustering(mat, *args, **kwargs):
+ print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
+ print(mat)
+ clusterer = hdbscan.HDBSCAN(metric='precomputed',
+ core_dist_n_jobs=cpu_count(),
+ *args,
+ **kwargs,
+ )