X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/blobdiff_plain/36b24ee933b95424686cfeaa2b2bd9776f23f853..7df8436067dba9a9e6867424002d01593e4bcd25:/clustering/clustering_base.py?ds=inline diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py new file mode 100644 index 0000000..1d86438 --- /dev/null +++ b/clustering/clustering_base.py @@ -0,0 +1,49 @@ +from pathlib import Path +import numpy as np +import pandas as pd +from dataclasses import dataclass + +def sim_to_dist(mat): + dist = 1-mat + dist[dist < 0] = 0 + np.fill_diagonal(dist,0) + return dist + +def process_clustering_result(clustering, subreddits): + + if hasattr(clustering,'n_iter_'): + print(f"clustering took {clustering.n_iter_} iterations") + + clusters = clustering.labels_ + + print(f"found {len(set(clusters))} clusters") + + cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_}) + + cluster_sizes = cluster_data.groupby("cluster").count().reset_index() + print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members") + + print(f"the median cluster has {cluster_sizes.subreddit.median()} members") + + print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member") + + print(f"{(cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']])} subreddits are in cluster -1",flush=True) + + return cluster_data + + +@dataclass +class clustering_result: + outpath:Path + max_iter:int + silhouette_score:float + alt_silhouette_score:float + name:str + n_clusters:int + +def read_similarity_mat(similarities, use_threads=True): + df = pd.read_feather(similarities, use_threads=use_threads) + mat = np.array(df.drop('_subreddit',1)) + n = mat.shape[0] + mat[range(n),range(n)] = 1 + return (df._subreddit,mat)