-
- print(f"clustering took {clustering.n_iter_} iterations")
- clusters = clustering.labels_
-
- print(f"found {len(set(clusters))} clusters")
-
- cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_})
-
- cluster_sizes = cluster_data.groupby("cluster").count()
- print(f"the largest cluster has {cluster_sizes.subreddit.max()} members")
-
- print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
-
- print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member")
-
- sys.stdout.flush()
+ cluster_data = process_clustering_result(clustering, subreddits)
+ output = Path(output)
+ output.parent.mkdir(parents=True,exist_ok=True)