1 from pathlib import Path
4 from dataclasses import dataclass
9 np.fill_diagonal(dist,0)
def process_clustering_result(clustering, subreddits):
    """Print summary statistics for a fitted clustering and tabulate its labels.

    Args:
        clustering: Fitted clustering object exposing ``labels_`` (one label
            per entry of ``subreddits``). If it also has ``n_iter_``, the
            iteration count is printed.
        subreddits: Subreddit names, aligned one-to-one with
            ``clustering.labels_``.

    Returns:
        A ``pandas.DataFrame`` with one row per subreddit and the columns
        ``subreddit`` and ``cluster``.
    """
    if hasattr(clustering, 'n_iter_'):
        print(f"clustering took {clustering.n_iter_} iterations")

    labels = clustering.labels_
    print(f"found {len(set(labels))} clusters")

    cluster_data = pd.DataFrame({'subreddit': subreddits, 'cluster': labels})

    # One row per distinct label; after count(), the ``subreddit`` column
    # holds the number of members carrying that label.
    cluster_sizes = cluster_data.groupby("cluster").count().reset_index()

    # Label -1 is reported separately and excluded from the "largest" figure
    # (presumably it is the clusterer's noise/unassigned label -- confirm).
    is_minus_one = cluster_sizes.cluster == -1

    print(f"the largest cluster has {cluster_sizes.loc[~is_minus_one].subreddit.max()} members")

    # NOTE: the median and the singleton count below still include the -1
    # row, unlike the "largest" line above; kept as-is to preserve output.
    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")

    print(f"{(cluster_sizes.subreddit == 1).sum()} clusters have 1 member")

    # Reduce to a plain integer: interpolating the DataFrame slice itself
    # would print a multi-line frame repr (or "Empty DataFrame ...") rather
    # than a count. An absent -1 label sums to 0.
    n_minus_one = int(cluster_sizes.loc[is_minus_one, 'subreddit'].sum())
    print(f"{n_minus_one} subreddits are in cluster -1", flush=True)

    return cluster_data
class clustering_result:
    """Declares two float-annotated silhouette-score fields for a clustering result.

    NOTE(review): ``dataclass`` is imported at the top of this file, but no
    ``@dataclass`` decorator is visible on this class from this view --
    confirm whether one is applied. Without it these lines are bare
    annotations and no ``__init__`` is generated for them.
    """
    # Silhouette score, annotated as a float.
    silhouette_score:float
    # A second silhouette value, annotated as a float; how it differs from
    # ``silhouette_score`` is not shown here -- TODO confirm against the
    # code that fills it in.
    alt_silhouette_score:float
def read_similarity_mat(similarities, use_threads=True):
    """Load a subreddit similarity table and split it into names and matrix.

    Args:
        similarities: Path (or anything ``pandas.read_feather`` accepts) of a
            feather file containing a ``_subreddit`` column plus the
            similarity columns. An already-loaded ``pandas.DataFrame`` with
            the same layout is also accepted and used directly.
        use_threads: Forwarded to ``pandas.read_feather``; ignored when a
            DataFrame is passed in.

    Returns:
        A tuple ``(names, mat)``: the ``_subreddit`` column as a Series, and
        a NumPy array built from the remaining columns with its main
        diagonal set to 1.
    """
    if isinstance(similarities, pd.DataFrame):
        df = similarities
    else:
        df = pd.read_feather(similarities, use_threads=use_threads)

    # Use ``columns=`` instead of a positional axis (``drop(label, 1)``):
    # pandas 2.x makes ``axis`` keyword-only and raises TypeError otherwise.
    # np.array() copies, so the diagonal write below never touches ``df``.
    mat = np.array(df.drop(columns='_subreddit'))

    # Force the main diagonal to 1 in place.
    np.fill_diagonal(mat, 1)
    return (df._subreddit, mat)