X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/7df8436067dba9a9e6867424002d01593e4bcd25..8d1df5b26ee80fee639e5b3ecd057fe8e72f166c:/clustering/hdbscan_clustering.py diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py index 888554a..4f4e0d6 100644 --- a/clustering/hdbscan_clustering.py +++ b/clustering/hdbscan_clustering.py @@ -28,6 +28,13 @@ def test_select_hdbscan_clustering(): cluster_selection_methods=['eom']; lsi_dimensions='all' + df = pd.read_csv("test_hdbscan/selection_data.csv") + test_select_hdbscan_clustering() + check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") + silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") + c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) + + @dataclass class hdbscan_clustering_result(clustering_result): min_cluster_size:int @@ -165,8 +172,4 @@ def make_KNN_plots(): KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') if __name__ == "__main__": - df = pd.read_csv("test_hdbscan/selection_data.csv") - test_select_hdbscan_clustering() - check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") - silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") - c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) + fire.Fire(select_hdbscan_clustering)