-def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
- subreddits, mat = read_similarity_mat(similarities)
- mat = sim_to_dist(mat)
- clustering = _hdbscan_clustering(mat,
- min_cluster_size=min_cluster_size,
- min_samples=min_samples,
- cluster_selection_epsilon=cluster_selection_epsilon,
- cluster_selection_method=cluster_selection_method,
- metric='precomputed',
- core_dist_n_jobs=cpu_count()
- )
-
- cluster_data = process_clustering_result(clustering, subreddits)
- isolates = clustering.labels_ == -1
- scoremat = mat[~isolates][:,~isolates]
- score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
- cluster_data.to_feather(output)
-
- silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
- silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
- silsampout = output.parent / ("silhouette_samples" + output.name)
- silhouette_samp.to_feather(silsampout)
-
- result = hdbscan_clustering_result(outpath=output,
- max_iter=None,
- silhouette_samples=silsampout,
- silhouette_score=score,
- alt_silhouette_score=score,
- name=name,
- min_cluster_size=min_cluster_size,
- min_samples=min_samples,
- cluster_selection_epsilon=cluster_selection_epsilon,
- cluster_selection_method=cluster_selection_method,
- lsi_dimensions=lsi_dim,
- n_isolates=isolates.sum(),
- n_clusters=len(set(clustering.labels_))
- )
+# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
+# subreddits, mat = read_similarity_mat(similarities)
+# mat = sim_to_dist(mat)
+# clustering = _hdbscan_clustering(mat,
+# min_cluster_size=min_cluster_size,
+# min_samples=min_samples,
+# cluster_selection_epsilon=cluster_selection_epsilon,
+# cluster_selection_method=cluster_selection_method,
+# metric='precomputed',
+# core_dist_n_jobs=cpu_count()
+# )
+
+# cluster_data = process_clustering_result(clustering, subreddits)
+# isolates = clustering.labels_ == -1
+# scoremat = mat[~isolates][:,~isolates]
+# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
+# cluster_data.to_feather(output)
+# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
+# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
+# silsampout = output.parent / ("silhouette_samples" + output.name)
+# silhouette_samp.to_feather(silsampout)
+
+# result = hdbscan_clustering_result(outpath=output,
+# silhouette_samples=silsampout,
+# silhouette_score=score,
+# name=name,
+# min_cluster_size=min_cluster_size,
+# min_samples=min_samples,
+# cluster_selection_epsilon=cluster_selection_epsilon,
+# cluster_selection_method=cluster_selection_method,
+# lsi_dimensions=lsi_dim,
+# n_isolates=isolates.sum(),
+# n_clusters=len(set(clustering.labels_))
+# )