refactor clustering.py into method-specific files.

[cdsc_reddit.git] / clustering / hdbscan_clustering.py
diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py

index 888554a55255a69e1c883bffa6ee4035a11a845d..4f4e0d6f2c4f18b47d3d96ac0991fbc72fdb6aef 100644 (file)
--- a/clustering/hdbscan_clustering.py
+++ b/clustering/hdbscan_clustering.py
@@ -28,6 +28,13 @@ def test_select_hdbscan_clustering():
      cluster_selection_methods=['eom'];
      lsi_dimensions='all'
  
+    df = pd.read_csv("test_hdbscan/selection_data.csv")
+    test_select_hdbscan_clustering()
+    check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
+    silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
+    c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
+
+
  @dataclass
  class hdbscan_clustering_result(clustering_result):
      min_cluster_size:int
@@ -165,8 +172,4 @@ def make_KNN_plots():
      KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
  
  if __name__ == "__main__":
-    df = pd.read_csv("test_hdbscan/selection_data.csv")
-    test_select_hdbscan_clustering()
-    check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
-    silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
-    c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
+    fire.Fire(select_hdbscan_clustering)